Merge branch 'develop' into cgroups

Martin Kroeker 2017-07-15 10:40:42 +02:00 committed by GitHub
commit 7294fb1d9d
169 changed files with 22837 additions and 840 deletions

View File

@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
)
ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h)
ADD_CUSTOM_TARGET(genconfig
ALL
DEPENDS openblas_config.h
)
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME}
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(genf77blas
ALL
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
)
@ -255,11 +260,11 @@ if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(gencblas
ALL
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
)
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

View File

@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
endif
RELA =
ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install
.NOTPARALLEL : all libs prof lapack-test install blas-test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
all :: libs netlib tests shared
all :: libs netlib $(RELA) tests shared
@echo
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
@echo
@ -215,6 +220,14 @@ ifndef NO_LAPACKE
endif
endif
ifeq ($(NO_LAPACK), 1)
re_lapack :
else
re_lapack :
@$(MAKE) -C relapack
endif
prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
@ -326,11 +339,7 @@ endif
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@$(MAKE) -C relapack clean
@rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
@echo Done.
# Makefile debugging trick:
# call print-VARIABLE to see the runtime value of any variable
print-%:
@echo '$*=$($*)'
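The print-% rule above is generic: for example, "make print-LAPACK_NOOPT" would echo the value of LAPACK_NOOPT after the filter-out near the top of this file, without building anything else, since the rule has no prerequisites.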

View File

@ -1,5 +1,4 @@
#ifeq logical or
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
ifeq ($(ARM_SOFTFP_ABI), 1)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
else
CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
endif
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -march=armv5
FCOMMON_OPT += -marm -march=armv5
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
endif

View File

@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
endif
ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
endif

View File

@ -83,6 +83,9 @@ VERSION = 0.2.20.dev
# Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
NO_AFFINITY = 1
#NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1

View File

@ -242,6 +242,10 @@ EXTRALIB += -lm
NO_EXPRECISION = 1
endif
ifeq ($(OSNAME), Android)
EXTRALIB += -lm
endif
ifeq ($(OSNAME), AIX)
EXTRALIB += -lm
endif
@ -486,12 +490,18 @@ BINARY_DEFINED = 1
CCOMMON_OPT += -marm
FCOMMON_OPT += -marm
# If softfp abi is mentioned on the command line, force it.
ifeq ($(ARM_SOFTFP_ABI), 1)
CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
CCOMMON_OPT += -mfloat-abi=softfp
FCOMMON_OPT += -mfloat-abi=softfp
endif
ifeq ($(OSNAME), Android)
ifeq ($(ARM_SOFTFP_ABI), 1)
EXTRALIB += -lm
else
CCOMMON_OPT += -mfloat-abi=hard
FCOMMON_OPT += -mfloat-abi=hard
EXTRALIB += -Wl,-lm_hard
endif
endif
endif
@ -1119,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
ifneq ($(NO_LAPACKE), 1)
LIB_COMPONENTS += LAPACKE
endif
ifeq ($(BUILD_RELAPACK), 1)
LIB_COMPONENTS += ReLAPACK
endif
endif
ifeq ($(ONLY_CBLAS), 1)

View File

@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF}
"#define __${BINARY}BIT__\t1\n"
"#define FUNDERSCORE\t${FU}\n")
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
file(APPEND ${TARGET_CONF}
"#define OS_WINNT\t1\n")
endif ()

View File

@ -77,7 +77,7 @@ if (CYGWIN)
set(NO_EXPRECISION 1)
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
if (SMP)
set(EXTRALIB "${EXTRALIB} -lpthread")
endif ()

View File

@ -72,20 +72,26 @@ if (MSVC)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
endif()
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
# disable WindowsStore strict CRT checks
set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
endif ()
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif ()
endif ()
message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
if (NOT ${GETARCH2_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
if (NOT ${GETARCH2_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
endif ()
endif ()
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
)
if (NOT ${GEN_CONFIG_H_RESULT})
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
endif ()
if (NOT ${GEN_CONFIG_H_RESULT})
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
endif ()
endif ()

View File

@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#endif
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n))
#endif
#endif
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
#ifdef _POSIX_MONOTONIC_CLOCK
@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
*address = 0;
}
#ifdef OS_WINDOWSSTORE
static __inline int readenv_atoi(char *env) {
return 0;
}
#else
#ifdef OS_WINDOWS
static __inline int readenv_atoi(char *env) {
env_var_t p;
@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
return(0);
}
#endif
#endif
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
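The new OS_WINDOWSSTORE branches above compile environment lookups down to constant stubs, since UWP apps cannot call GetEnvironmentVariable. A self-contained sketch of the same pattern (a hypothetical demo, not the common.h code; OpenBLAS keys this off its own OS_WINDOWSSTORE define):

/* demo.c -- stub out env lookups where no environment API exists */
#include <stdio.h>
#include <stdlib.h>

#ifdef OS_WINDOWSSTORE
static int readenv_atoi(const char *name) { (void)name; return 0; } /* tunables fall back to built-in defaults */
#else
static int readenv_atoi(const char *name) {
  const char *p = getenv(name);
  return p ? atoi(p) : 0;
}
#endif

int main(void) {
  /* e.g. OPENBLAS_NUM_THREADS=4 ./demo */
  printf("threads requested: %d\n", readenv_atoi("OPENBLAS_NUM_THREADS"));
  return 0;
}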

View File

@ -111,11 +111,6 @@ REALNAME:
#define PROFCODE
#ifdef __ARM_PCS
//-mfloat-abi=softfp
#define SOFT_FLOAT_ABI
#endif
#endif
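The deleted block derived a private SOFT_FLOAT_ABI macro from __ARM_PCS; the rewritten kernels below instead test the compiler's ACLE predefines directly. A small probe showing the distinction they rely on (assumes GCC or Clang targeting ARM):

#include <stdio.h>

int main(void) {
#if defined(__ARM_PCS_VFP)
  puts("hard-float PCS: FP arguments and results use VFP registers");
#elif defined(__ARM_PCS)
  puts("base PCS (soft/softfp): FP arguments and results use core registers");
#else
  puts("not an ARM AAPCS target");
#endif
  return 0;
}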

View File

@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;
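This is an off-by-one fix that recurs in the next several files: splitting a range among k threads produces k+1 boundary values, so the range arrays need MAX_CPU_NUMBER + 1 slots. A toy illustration of the fence-post count (illustrative code, not the OpenBLAS partitioner):

#include <stdio.h>
#define MAX_CPU_NUMBER 4

int main(void) {
  long range[MAX_CPU_NUMBER + 1];            /* k partitions need k+1 fence posts */
  long n = 1000;
  for (int i = 0; i <= MAX_CPU_NUMBER; i++)  /* writes MAX_CPU_NUMBER + 1 entries */
    range[i] = n * i / MAX_CPU_NUMBER;
  for (int i = 0; i < MAX_CPU_NUMBER; i++)
    printf("thread %d gets [%ld, %ld)\n", i, range[i], range[i + 1]);
  return 0;
}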

View File

@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#endif
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
blas_queue_t queue[MAX_CPU_NUMBER + 1];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];

View File

@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)i;
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i);
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
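Both hunks replace mask-based rounding with divide-and-multiply. The old form ((x + mask) & ~mask) only yields a multiple of (mask + 1) when that alignment is a power of two; the new form is correct for any alignment. A quick standalone check (made-up numbers):

#include <stdio.h>

int main(void) {
  long mask = 2, x = 8;                                   /* alignment 3: not a power of two */
  long old_way = (x + mask) & ~mask;                      /* 10 & ~2 = 8, not a multiple of 3 */
  long new_way = ((x + mask) / (mask + 1)) * (mask + 1);  /* 9, correctly rounded */
  printf("old=%ld new=%ld\n", old_way, new_way);
  return 0;
}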

View File

@ -12,6 +12,8 @@ if (SMP)
set(BLAS_SERVER blas_server_omp.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(BLAS_SERVER blas_server_win32.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
set(BLAS_SERVER blas_server_win32.c)
endif ()
if (NOT DEFINED BLAS_SERVER)

View File

@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
TerminateThread(blas_threads[i],0);
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
}
blas_server_avail = 0;
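As the new comment notes, TerminateThread belongs to the desktop/system API partitions, so UWP builds must compile it out. A reduced sketch of the same guard using the SDK's partition macro (simplified; the real shutdown walks the whole thread pool):

#include <windows.h>   /* pulls in winapifamily.h on Windows 8+ SDKs */
#include <stdio.h>

static DWORD WINAPI worker(LPVOID arg) { (void)arg; return 0; }

static void reap_thread(HANDLE h) {
  WaitForSingleObject(h, 5);     /* give the worker a short grace period */
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
  TerminateThread(h, 0);         /* forced kill: desktop partition only */
#endif
  CloseHandle(h);
}

int main(void) {
  HANDLE h = CreateThread(NULL, 0, worker, NULL, 0, NULL);
  if (h) reap_thread(h);
  puts("shut down");
  return 0;
}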

View File

@ -825,10 +825,11 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
if (common -> magic != SH_MAGIC)
cpu_set_t *cpusetp;
int nums;
int ret;
#ifdef DEBUG
fprintf(stderr, "Shared Memory Initialization.\n");
#endif
@ -883,7 +884,7 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = 0;
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;

View File

@ -1,7 +1,5 @@
include $(KERNELDIR)/KERNEL.ARMV5
###############################################################################
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal.c
DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
endif
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
endif
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S

View File

@ -1,91 +1,12 @@
#################################################################################
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S
SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S
SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S
SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S
ISMINKERNEL = iamax_vfp.S
IDMINKERNEL = iamax_vfp.S
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S
SASUMKERNEL = asum_vfp.S
DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S
SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
CDOTKERNEL = cdot_vfp.S
ZDOTKERNEL = zdot_vfp.S
include $(KERNELDIR)/KERNEL.ARMV6
SNRM2KERNEL = nrm2_vfpv3.S
DNRM2KERNEL = nrm2_vfpv3.S
CNRM2KERNEL = nrm2_vfpv3.S
ZNRM2KERNEL = nrm2_vfpv3.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal.c
DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SGEMVNKERNEL = gemv_n_vfpv3.S
DGEMVNKERNEL = gemv_n_vfpv3.S
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S

View File

@ -475,6 +475,14 @@ asum_kernel_L999:
vadd.f32 s0 , s0, s1 // set return value
#endif
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif
bx lr
EPILOGUE
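This return-value fixup recurs across the asum/nrm2/dot kernels below: without __ARM_PCS_VFP the caller expects the result in core registers, so the epilogue moves s0 into r0 (float) or d0 into r0:r1 (double) before bx lr; under the hard-float ABI the value is already where it belongs. The same contract seen from C (illustrative functions; compile once with -mfloat-abi=softfp and once with hard to compare the generated epilogues):

#include <math.h>
#include <stdio.h>

/* Under softfp these return in r0 / r0:r1; under the hard-float ABI the
   identical C returns in s0 / d0. Hand-written kernels must honor both. */
static float  sasum_like(const float *x, long n)  { float  s = 0; while (n--) s += fabsf(*x++); return s; }
static double dasum_like(const double *x, long n) { double s = 0; while (n--) s += fabs(*x++);  return s; }

int main(void) {
  float  xf[3] = {1.0f, -2.0f, 3.0f};
  double xd[3] = {1.0, -2.0, 3.0};
  printf("%g %g\n", sasum_like(xf, 3), dasum_like(xd, 3));
  return 0;
}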

View File

@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI
//hard
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
#else
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_X [fp, #0 ]
#define OLD_INC_X [fp, #4 ]
#define OLD_Y [fp, #8 ]
#define OLD_INC_Y [fp, #12 ]
#else
#define OLD_ALPHA [fp, #0]
#define OLD_X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define OLD_Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#endif
#else //COMPLEX
#if !defined(DOUBLE)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define OLD_Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#else
#define OLD_ALPHAR [fp, #0]
#define OLD_ALPHAI [fp, #8]
#define OLD_X [fp, #16 ]
#define OLD_INC_X [fp, #20 ]
#define OLD_Y [fp, #24 ]
#define OLD_INC_Y [fp, #28 ]
#endif
#endif //!defined(COMPLEX)
#else //__ARM_PCS_VFP
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
#endif //!defined(__ARM_PCS_VFP)
#define N r0
#define Y r1
#define INC_X r2
@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
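The mnemonic swaps in this file (and in the gemm/gemv/rot kernels below) are a syntax migration, not a behavioral change: fnmacs/fnmacd are pre-UAL VFP spellings of multiply-subtract that newer GNU assemblers reject, and vmls.f32/vmls.f64 (likewise vnmul for fnmuls) are the unified-syntax equivalents. In scalar C terms (a model of the instruction semantics, nothing more):

#include <stdio.h>

static float vmls_f32(float acc, float a, float b)  { return acc - a * b; }  /* was fnmacs */
static float vmla_f32(float acc, float a, float b)  { return acc + a * b; }  /* fmacs, unchanged here */
static float vnmul_f32(float a, float b)            { return -(a * b);    }  /* was fnmuls */

int main(void) {
  printf("vmls:  %g\n", vmls_f32(10.0f, 2.0f, 3.0f));   /* 4 */
  printf("vmla:  %g\n", vmla_f32(10.0f, 2.0f, 3.0f));   /* 16 */
  printf("vnmul: %g\n", vnmul_f32(2.0f, 3.0f));         /* -6 */
  return 0;
}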
@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8
sub sp, sp, #STACKSIZE // reserve stack
#ifdef ARM_SOFTFP_ABI
#ifndef DOUBLE
vmov s0, r3 //move alpha to s0
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
ldr X, OLD_X
#else
vldr d0, OLD_ALPHA
ldr X, OLD_X
#endif
#else //COMPLEX
#if !defined(DOUBLE)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr X, OLD_X
#else
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr X, OLD_X
#endif
#endif
#endif
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y
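The macro blocks above encode where each argument lands for every ABI/type combination: AAPCS fills r0-r3 in order and spills the rest to the caller's stack, which the kernel reads back through [fp, #offset]. A reduced C signature showing the softfp layout (hypothetical; the real BLAS kernels carry extra dummy arguments, so their actual offsets differ):

/* softfp: n -> r0, alpha -> r1 (a core register!), x -> r2, inc_x -> r3,
   y and inc_y -> stack slots. Hard-float moves alpha to s0 and shifts the
   remaining arguments down, which is why both layouts are spelled out. */
static void saxpy_like(long n, float alpha,
                       const float *x, long inc_x,
                       float *y, long inc_y) {
  for (long i = 0; i < n; i++)
    y[i * inc_y] += alpha * x[i * inc_x];
}

int main(void) {
  float x[3] = {1, 2, 3}, y[3] = {0, 0, 0};
  saxpy_like(3, 2.0f, x, 1, y, 1);
  return y[2] == 6.0f ? 0 : 1;
}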

View File

@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers
*******************************************************/
#define OLD_INC_Y [fp, #4 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_RETURN_ADDR r0
#define OLD_N r1
#define OLD_X r2
#define OLD_INC_X r3
#define OLD_Y [fp, #0 ]
#define OLD_INC_Y [fp, #4 ]
#define RETURN_ADDR r8
#else
#define OLD_Y r3
#define OLD_INC_Y [fp, #0 ]
#endif
#define I r5
#define Y r6
@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
push {r4 - r9, fp}
add fp, sp, #24
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmov s2, s0
vmov s3, s0
#if !defined(__ARM_PCS_VFP)
mov RETURN_ADDR, OLD_RETURN_ADDR
mov N, OLD_N
mov X, OLD_X
mov INC_X, OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#else
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#endif
cmp N, #0
ble cdot_kernel_L999
@ -265,7 +283,6 @@ cdot_kernel_S10:
cdot_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
@ -276,8 +293,11 @@ cdot_kernel_L999:
vadd.f32 s0 , s0, s2
vsub.f32 s1 , s1, s3
#endif
#if !defined(__ARM_PCS_VFP)
vstm RETURN_ADDR, {s0 - s1}
#endif
sub sp, fp, #24
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr
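The cdot changes follow from the return convention: the two-float result is a composite larger than 4 bytes, so under the base PCS the caller passes a result address in r0 (saved above as RETURN_ADDR) and every visible argument shifts one slot — hence the re-read of N, X and INC_X from shifted positions and the final vstm through the hidden pointer. Modeled in C (stand-in struct type, and a simplified body that only sums x; the compiler inserts the hidden pointer for you):

#include <stdio.h>

typedef struct { float re, im; } cfloat;   /* stand-in for the complex float result */

/* Returned via a hidden pointer under AAPCS, since the struct exceeds 4 bytes. */
static cfloat csum_like(long n, const float *x, long inc_x) {
  cfloat r = {0.0f, 0.0f};
  for (long i = 0; i < n; i++) {
    r.re += x[2 * i * inc_x];
    r.im += x[2 * i * inc_x + 1];
  }
  return r;                                /* written through the hidden pointer */
}

int main(void) {
  float x[4] = {1, 2, 3, 4};
  cfloat r = csum_like(2, x, 1);
  printf("(%g, %g)\n", r.re, r.im);
  return 0;
}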

View File

@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CN) || defined(CT)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmacs
#define FMAC_R2 fnmacs
#define FMAC_R1 vmls.f32
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fnmacs
#define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT)
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmacs
#define FMAC_R1 vmls.f32
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 vmls.f32
#endif
@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_A_SOFTFP [fp, #4 ]
#define OLD_LDA [fp, #8 ]
#define X [fp, #12 ]
#define OLD_INC_X [fp, #16 ]
#define Y [fp, #20 ]
#define OLD_INC_Y [fp, #24 ]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble cgemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M
vstr s0 , ALPHA_R

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_A_SOFTFP [fp, #4 ]
#define OLD_LDA [fp, #8 ]
#define X [fp, #12 ]
#define OLD_INC_X [fp, #16 ]
#define Y [fp, #20 ]
#define OLD_INC_Y [fp, #24 ]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble cgemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_N, N

View File

@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#define OFFSET [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CN) || defined(CT)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#define OFFSET [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmuls
#define FMAC_R2 fnmacs
#define FMAC_R1 vnmul.f32
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls
#define FMAC_I2 fnmacs
#define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT)
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuls
#define FMAC_R2 fmacs
#define FMAC_I1 fnmuls
#define FMAC_I1 vnmul.f32
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs
#define FMAC_R1 fmuls
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls
#define FMAC_I2 fmacs
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmuls
#define FMAC_R1 vnmul.f32
#define FMAC_R2 fmacs
#define FMAC_I1 fnmuls
#define FMAC_I2 fnmacs
#define FMAC_I1 vnmul.f32
#define FMAC_I2 vmls.f32
#endif
@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -246,6 +246,9 @@ ddot_kernel_L999:
vldm r3, { d8 - d15} // restore floating point registers
vadd.f64 d0 , d0, d1 // set return value
#if !defined(__ARM_PCS_VFP)
vmov r0, r1, d0
#endif
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

View File

@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define OLD_C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#define OFFSET [fp, #28 ]
#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define OLD_C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#define OFFSET [fp, #28 ]
#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble gemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M

View File

@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI
//hard
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#else
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
#define AO1 r0
@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble gemvn_kernel_L999
#ifndef DOUBLE
#ifdef ARM_SOFTFP_ABI
vmov s0, OLD_ALPHA
ldr OLD_A, OLD_A_SOFTFP
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

View File

@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI
//hard abi
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#else
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
#define M r0
@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble gemvt_kernel_L999
#ifndef DOUBLE
#ifdef ARM_SOFTFP_ABI
vmov s0, OLD_ALPHA
ldr OLD_A, OLD_A_SOFTFP
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A

View File

@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble gemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_N, N

View File

@ -573,6 +573,13 @@ nrm2_kernel_L999:
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#endif
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif
bx lr

View File

@ -503,8 +503,13 @@ nrm2_kernel_L999:
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#ifdef ARM_SOFTFP_ABI
vmov r0, s0
#endif
#if !defined(__ARM_PCS_VFP)
#if defined(DOUBLE)
vmov r0, r1, d0
#else
vmov r0, s0
#endif
#endif

View File

@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OLD_INC_Y [fp, #0 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_C [fp, #4]
#define OLD_S [fp, #8]
#else
#define OLD_C [fp, #8]
#define OLD_S [fp, #16]
#endif
#endif
#define N r0
#define X r1
@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X, { d2 }
fstmiad Y, { d3 }
@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X, { s2 }
fstmias Y, { s3 }
@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
vstr d2 , [ X, #0 ]
vstr d3 , [ Y, #0 ]
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
vstr d2 , [ X, #8 ]
vstr d3 , [ Y, #8 ]
@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
vstr s2 , [ X, #0 ]
vstr s3 , [ Y, #0 ]
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
vstr s2 , [ X, #4 ]
vstr s3 , [ Y, #4 ]
@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8
ldr INC_Y , OLD_INC_Y
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vldr s0, OLD_C
vldr s1, OLD_S
#else
vldr d0, OLD_C
vldr d1, OLD_S
#endif
#endif
cmp N, #0
ble rot_kernel_L999

View File

@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X, { d2 - d3 }
@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X, { s2 - s3 }

View File

@ -329,20 +329,19 @@ sdot_kernel_L999:
vldm r3, { s8 - s15} // restore floating point registers
#if defined(DSDOT)
vadd.f64 d0 , d0, d1 // set return value
#ifdef ARM_SOFTFP_ABI
vmov r0, r1, d0
#else
vadd.f32 s0 , s0, s1 // set return value
#endif
#if !defined(__ARM_PCS_VFP)
#if defined(DSDOT)
vmov r0, r1, d0
#else
vadd.f32 s0 , s0, s1 // set return value
#ifdef ARM_SOFTFP_ABI
vmov r0, s0
#endif
#endif
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

View File

@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OLD_M r0
#define OLD_N r1
#define OLD_K r2
#ifdef ARM_SOFTFP_ABI
#define OLD_ALPHA r3
//#define OLD_A
#else //hard
#define OLD_A r3
#define OLD_ALPHA s0
#endif
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define M [fp, #-256 ]
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#ifndef ARM_SOFTFP_ABI
#define A [fp, #-268 ]
#endif
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, #-240]
@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#ifdef ARM_SOFTFP_ABI
#define A [fp, #4 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#else //hard
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
#define L r2
@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
#ifdef ARM_SOFTFP_ABI
str OLD_ALPHA, ALPHA
#else //hard
str OLD_A, A
vstr OLD_ALPHA, ALPHA
#endif
sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers


@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define OLD_C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#define OFFSET [fp, #20 ]
#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#define OFFSET [fp, #20 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define OLD_X [fp, #0 ]
#define OLD_INC_X [fp, #4 ]
#define OLD_Y [fp, #8 ]
#define OLD_INC_Y [fp, #12 ]
#else
#define OLD_X [fp, #8 ]
#define OLD_INC_X [fp, #12]
#define OLD_Y [fp, #16]
#define OLD_INC_Y [fp, #20]
#endif
#else //COMPLEX
#if !defined(DOUBLE)
#define OLD_X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define OLD_Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#else
#define OLD_X [fp, #16]
#define OLD_INC_X [fp, #20]
#define OLD_Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#endif // !defined(__ARM_PCS_VFP)
#else
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
#endif
#define N r0
@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
push {r4 , fp}
add fp, sp, #8
#if !defined(__ARM_PCS_VFP)
ldr X, OLD_X
#endif
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y


@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers
*******************************************************/
#define OLD_INC_Y [fp, #4 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_RETURN_ADDR r0
#define OLD_N r1
#define OLD_X r2
#define OLD_INC_X r3
#define OLD_Y [fp, #0 ]
#define OLD_INC_Y [fp, #4 ]
#define RETURN_ADDR r8
#else
#define OLD_Y r3
#define OLD_INC_Y [fp, #0 ]
#endif
#define I r5
#define Y r6
@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
push {r4 - r9, fp}
add fp, sp, #24
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vcvt.f64.f32 d2, s0
vcvt.f64.f32 d3, s0
#if !defined(__ARM_PCS_VFP)
mov RETURN_ADDR, OLD_RETURN_ADDR
mov N, OLD_N
mov X, OLD_X
mov INC_X, OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#else
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#endif
cmp N, #0
ble zdot_kernel_L999
@ -280,8 +297,11 @@ zdot_kernel_L999:
vadd.f64 d0 , d0, d2
vsub.f64 d1 , d1, d3
#endif
#if !defined(__ARM_PCS_VFP)
vstm RETURN_ADDR, {d0 - d1}
#endif
sub sp, fp, #24
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr


@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CN) || defined(CT)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmacd
#define FMAC_R2 fnmacd
#define FMAC_R1 vmls.f64
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fnmacd
#define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT)
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmacd
#define FMAC_R1 vmls.f64
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 vmls.f64
#endif
@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR [fp, #0 ]
#define OLD_ALPHAI [fp, #8 ]
#define OLD_A_SOFTFP [fp, #16]
#define OLD_LDA [fp, #20]
#define X [fp, #24]
#define OLD_INC_X [fp, #28]
#define Y [fp, #32]
#define OLD_INC_Y [fp, #36]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble zgemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M
vstr d0 , ALPHA_R


@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR [fp, #0 ]
#define OLD_ALPHAI [fp, #8 ]
#define OLD_A_SOFTFP [fp, #16]
#define OLD_LDA [fp, #20]
#define X [fp, #24]
#define OLD_INC_X [fp, #28]
#define Y [fp, #32]
#define OLD_INC_Y [fp, #36]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble zgemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_N, N


@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#define OFFSET [fp, #36 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CN) || defined(CT)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#define OFFSET [fp, #36 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmuld
#define FMAC_R2 fnmacd
#define FMAC_R1 vnmul.f64
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld
#define FMAC_I2 fnmacd
#define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT)
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuld
#define FMAC_R2 fmacd
#define FMAC_I1 fnmuld
#define FMAC_I1 vnmul.f64
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubd
#define FMAC_R1 fmuld
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld
#define FMAC_I2 fmacd
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmuld
#define FMAC_R1 vnmul.f64
#define FMAC_R2 fmacd
#define FMAC_I1 fnmuld
#define FMAC_I2 fnmacd
#define FMAC_I1 vnmul.f64
#define FMAC_I2 vmls.f64
#endif
@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %8, %2 \n\t"
"lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %10, %2 \n\t"
"lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %12, %2 \n\t"
"lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x)
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"
"lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"


@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
{
__asm__
(
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t"
"lxvw4x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t"
"lxvw4x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t"
"lxvw4x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"lxvw4x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"lxvw4x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"lxvw4x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"2: \n\t"
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:


@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
"lxvw4x 32, 0, %4 \n\t"
"lxvw4x 33, %5, %4 \n\t"
"lxvw4x 34, %6, %4 \n\t"
"lxvw4x 35, %7, %4 \n\t"
"lxvw4x 36, %8, %4 \n\t"
"lxvw4x 37, %9, %4 \n\t"
"lxvw4x 38, %10, %4 \n\t"
"lxvw4x 39, %11, %4 \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %4 \n\t"
"lxvd2x 35, %7, %4 \n\t"
"lxvd2x 36, %8, %4 \n\t"
"lxvd2x 37, %9, %4 \n\t"
"lxvd2x 38, %10, %4 \n\t"
"lxvd2x 39, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
"lxvw4x 40, 0, %4 \n\t"
"lxvw4x 41, %5, %4 \n\t"
"lxvw4x 42, %6, %4 \n\t"
"lxvw4x 43, %7, %4 \n\t"
"lxvw4x 44, %8, %4 \n\t"
"lxvw4x 45, %9, %4 \n\t"
"lxvw4x 46, %10, %4 \n\t"
"lxvw4x 47, %11, %4 \n\t"
"lxvd2x 40, 0, %4 \n\t"
"lxvd2x 41, %5, %4 \n\t"
"lxvd2x 42, %6, %4 \n\t"
"lxvd2x 43, %7, %4 \n\t"
"lxvd2x 44, %8, %4 \n\t"
"lxvd2x 45, %9, %4 \n\t"
"lxvd2x 46, %10, %4 \n\t"
"lxvd2x 47, %11, %4 \n\t"
"addi %4, %4, -128 \n\t"
"lxvw4x 48, 0, %3 \n\t"
"lxvw4x 49, %5, %3 \n\t"
"lxvw4x 50, %6, %3 \n\t"
"lxvw4x 51, %7, %3 \n\t"
"lxvw4x 0, %8, %3 \n\t"
"lxvw4x 1, %9, %3 \n\t"
"lxvw4x 2, %10, %3 \n\t"
"lxvw4x 3, %11, %3 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"lxvd2x 0, %8, %3 \n\t"
"lxvd2x 1, %9, %3 \n\t"
"lxvd2x 2, %10, %3 \n\t"
"lxvd2x 3, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"lxvw4x 4, 0, %3 \n\t"
"lxvw4x 5, %5, %3 \n\t"
"lxvw4x 6, %6, %3 \n\t"
"lxvw4x 7, %7, %3 \n\t"
"lxvw4x 8, %8, %3 \n\t"
"lxvw4x 9, %9, %3 \n\t"
"lxvw4x 10, %10, %3 \n\t"
"lxvw4x 11, %11, %3 \n\t"
"lxvd2x 4, 0, %3 \n\t"
"lxvd2x 5, %5, %3 \n\t"
"lxvd2x 6, %6, %3 \n\t"
"lxvd2x 7, %7, %3 \n\t"
"lxvd2x 8, %8, %3 \n\t"
"lxvd2x 9, %9, %3 \n\t"
"lxvd2x 10, %10, %3 \n\t"
"lxvd2x 11, %11, %3 \n\t"
"addi %3, %3, -128 \n\t"
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 48, 0, %4 \n\t"
"stxvw4x 49, %5, %4 \n\t"
"stxvw4x 50, %6, %4 \n\t"
"stxvw4x 51, %7, %4 \n\t"
"stxvw4x 0, %8, %4 \n\t"
"stxvw4x 1, %9, %4 \n\t"
"stxvw4x 2, %10, %4 \n\t"
"stxvw4x 3, %11, %4 \n\t"
"stxvd2x 48, 0, %4 \n\t"
"stxvd2x 49, %5, %4 \n\t"
"stxvd2x 50, %6, %4 \n\t"
"stxvd2x 51, %7, %4 \n\t"
"stxvd2x 0, %8, %4 \n\t"
"stxvd2x 1, %9, %4 \n\t"
"stxvd2x 2, %10, %4 \n\t"
"stxvd2x 3, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
"stxvw4x 4, 0, %4 \n\t"
"stxvw4x 5, %5, %4 \n\t"
"stxvw4x 6, %6, %4 \n\t"
"stxvw4x 7, %7, %4 \n\t"
"stxvw4x 8, %8, %4 \n\t"
"stxvw4x 9, %9, %4 \n\t"
"stxvw4x 10, %10, %4 \n\t"
"stxvw4x 11, %11, %4 \n\t"
"stxvd2x 4, 0, %4 \n\t"
"stxvd2x 5, %5, %4 \n\t"
"stxvd2x 6, %6, %4 \n\t"
"stxvd2x 7, %7, %4 \n\t"
"stxvd2x 8, %8, %4 \n\t"
"stxvd2x 9, %9, %4 \n\t"
"stxvd2x 10, %10, %4 \n\t"
"stxvd2x 11, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"


@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %8, %2 \n\t"
"lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %10, %2 \n\t"
"lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %12, %2 \n\t"
"lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"
"lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"


@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
{
__asm__
(
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"2: \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:


@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t"
"lxvw4x 41, %10, %2 \n\t"
"lxvw4x 49, %10, %3 \n\t"
"lxvw4x 42, %11, %2 \n\t"
"lxvw4x 50, %11, %3 \n\t"
"lxvw4x 43, %12, %2 \n\t"
"lxvw4x 51, %12, %3 \n\t"
"lxvw4x 44, %13, %2 \n\t"
"lxvw4x %x4, %13, %3 \n\t"
"lxvw4x 45, %14, %2 \n\t"
"lxvw4x %x5, %14, %3 \n\t"
"lxvw4x 46, %15, %2 \n\t"
"lxvw4x %x6, %15, %3 \n\t"
"lxvw4x 47, %16, %2 \n\t"
"lxvw4x %x7, %16, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 41, %10, %2 \n\t"
"lxvd2x 49, %10, %3 \n\t"
"lxvd2x 42, %11, %2 \n\t"
"lxvd2x 50, %11, %3 \n\t"
"lxvd2x 43, %12, %2 \n\t"
"lxvd2x 51, %12, %3 \n\t"
"lxvd2x 44, %13, %2 \n\t"
"lxvd2x %x4, %13, %3 \n\t"
"lxvd2x 45, %14, %2 \n\t"
"lxvd2x %x5, %14, %3 \n\t"
"lxvd2x 46, %15, %2 \n\t"
"lxvd2x %x6, %15, %3 \n\t"
"lxvd2x 47, %16, %2 \n\t"
"lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"1: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"xvmaddasp 33, 41, 49 \n\t"
"lxvw4x 41, %10, %2 \n\t"
"lxvw4x 49, %10, %3 \n\t"
"lxvd2x 41, %10, %2 \n\t"
"lxvd2x 49, %10, %3 \n\t"
"xvmaddasp 34, 42, 50 \n\t"
"lxvw4x 42, %11, %2 \n\t"
"lxvw4x 50, %11, %3 \n\t"
"lxvd2x 42, %11, %2 \n\t"
"lxvd2x 50, %11, %3 \n\t"
"xvmaddasp 35, 43, 51 \n\t"
"lxvw4x 43, %12, %2 \n\t"
"lxvw4x 51, %12, %3 \n\t"
"lxvd2x 43, %12, %2 \n\t"
"lxvd2x 51, %12, %3 \n\t"
"xvmaddasp 36, 44, %x4 \n\t"
"lxvw4x 44, %13, %2 \n\t"
"lxvw4x %x4, %13, %3 \n\t"
"lxvd2x 44, %13, %2 \n\t"
"lxvd2x %x4, %13, %3 \n\t"
"xvmaddasp 37, 45, %x5 \n\t"
"lxvw4x 45, %14, %2 \n\t"
"lxvw4x %x5, %14, %3 \n\t"
"lxvd2x 45, %14, %2 \n\t"
"lxvd2x %x5, %14, %3 \n\t"
"xvmaddasp 38, 46, %x6 \n\t"
"lxvw4x 46, %15, %2 \n\t"
"lxvw4x %x6, %15, %3 \n\t"
"lxvd2x 46, %15, %2 \n\t"
"lxvd2x %x6, %15, %3 \n\t"
"xvmaddasp 39, 47, %x7 \n\t"
"lxvw4x 47, %16, %2 \n\t"
"lxvw4x %x7, %16, %3 \n\t"
"lxvd2x 47, %16, %2 \n\t"
"lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"


@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xscvdpspn 37, %x14 \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t"
"lxvw4x 32, 0, %3 \n\t" // load x
"lxvw4x 33, %15, %3 \n\t"
"lxvw4x 34, %16, %3 \n\t"
"lxvw4x 35, %17, %3 \n\t"
"lxvd2x 32, 0, %3 \n\t" // load x
"lxvd2x 33, %15, %3 \n\t"
"lxvd2x 34, %16, %3 \n\t"
"lxvd2x 35, %17, %3 \n\t"
"lxvw4x 48, 0, %4 \n\t" // load y
"lxvw4x 49, %15, %4 \n\t"
"lxvw4x 50, %16, %4 \n\t"
"lxvw4x 51, %17, %4 \n\t"
"lxvd2x 48, 0, %4 \n\t" // load y
"lxvd2x 49, %15, %4 \n\t"
"lxvd2x 50, %16, %4 \n\t"
"lxvd2x 51, %17, %4 \n\t"
"addi %3, %3, 64 \n\t"
"addi %4, %4, 64 \n\t"
@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"lxvw4x 32, 0, %3 \n\t" // load x
"lxvw4x 33, %15, %3 \n\t"
"lxvd2x 32, 0, %3 \n\t" // load x
"lxvd2x 33, %15, %3 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"lxvw4x 34, %16, %3 \n\t"
"lxvw4x 35, %17, %3 \n\t"
"lxvd2x 34, %16, %3 \n\t"
"lxvd2x 35, %17, %3 \n\t"
"xvmulsp %x9, 48, 37 \n\t" // s * y
"xvmulsp %x10, 49, 37 \n\t"
"lxvw4x 48, 0, %4 \n\t" // load y
"lxvw4x 49, %15, %4 \n\t"
"lxvd2x 48, 0, %4 \n\t" // load y
"lxvd2x 49, %15, %4 \n\t"
"xvmulsp %x11, 50, 37 \n\t"
"xvmulsp %x12, 51, 37 \n\t"
"lxvw4x 50, %16, %4 \n\t"
"lxvw4x 51, %17, %4 \n\t"
"lxvd2x 50, %16, %4 \n\t"
"lxvd2x 51, %17, %4 \n\t"
"xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
"xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvw4x 40, 0, %3 \n\t" // store x
"stxvw4x 41, %15, %3 \n\t"
"stxvw4x 42, %16, %3 \n\t"
"stxvw4x 43, %17, %3 \n\t"
"stxvd2x 40, 0, %3 \n\t" // store x
"stxvd2x 41, %15, %3 \n\t"
"stxvd2x 42, %16, %3 \n\t"
"stxvd2x 43, %17, %3 \n\t"
"stxvw4x %x5, 0, %4 \n\t" // store y
"stxvw4x %x6, %15, %4 \n\t"
"stxvw4x %x7, %16, %4 \n\t"
"stxvw4x %x8, %17, %4 \n\t"
"stxvd2x %x5, 0, %4 \n\t" // store y
"stxvd2x %x6, %15, %4 \n\t"
"stxvd2x %x7, %16, %4 \n\t"
"stxvd2x %x8, %17, %4 \n\t"
"addi %3, %3, 128 \n\t"
"addi %4, %4, 128 \n\t"
@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvw4x 40, 0, %3 \n\t" // store x
"stxvw4x 41, %15, %3 \n\t"
"stxvw4x 42, %16, %3 \n\t"
"stxvw4x 43, %17, %3 \n\t"
"stxvd2x 40, 0, %3 \n\t" // store x
"stxvd2x 41, %15, %3 \n\t"
"stxvd2x 42, %16, %3 \n\t"
"stxvd2x 43, %17, %3 \n\t"
"stxvw4x %x5, 0, %4 \n\t" // store y
"stxvw4x %x6, %15, %4 \n\t"
"stxvw4x %x7, %16, %4 \n\t"
"stxvw4x %x8, %17, %4 \n"
"stxvd2x %x5, 0, %4 \n\t" // store y
"stxvd2x %x6, %15, %4 \n\t"
"stxvd2x %x7, %16, %4 \n\t"
"stxvd2x %x8, %17, %4 \n"
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"


@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xscvdpspn %x3, %x3 \n\t"
"xxspltw %x3, %x3, 0 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t"
"lxvw4x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t"
"lxvw4x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t"
"lxvw4x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
"lxvd2x 34, %5, %2 \n\t"
"lxvd2x 35, %6, %2 \n\t"
"lxvd2x 36, %7, %2 \n\t"
"lxvd2x 37, %8, %2 \n\t"
"lxvd2x 38, %9, %2 \n\t"
"lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
"xvmulsp 42, 34, %x3 \n\t"
"xvmulsp 43, 35, %x3 \n\t"
"lxvw4x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t"
"lxvd2x 34, %5, %2 \n\t"
"lxvd2x 35, %6, %2 \n\t"
"xvmulsp 44, 36, %x3 \n\t"
"xvmulsp 45, 37, %x3 \n\t"
"lxvw4x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t"
"lxvd2x 36, %7, %2 \n\t"
"lxvd2x 37, %8, %2 \n\t"
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
"lxvw4x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t"
"lxvd2x 38, %9, %2 \n\t"
"lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, -128 \n\t"
"stxvw4x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n\t"
"stxvd2x 40, 0, %2 \n\t"
"stxvd2x 41, %4, %2 \n\t"
"stxvd2x 42, %5, %2 \n\t"
"stxvd2x 43, %6, %2 \n\t"
"stxvd2x 44, %7, %2 \n\t"
"stxvd2x 45, %8, %2 \n\t"
"stxvd2x 46, %9, %2 \n\t"
"stxvd2x 47, %10, %2 \n\t"
"addi %2, %2, 256 \n\t"
@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
"stxvw4x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n"
"stxvd2x 40, 0, %2 \n\t"
"stxvd2x 41, %4, %2 \n\t"
"stxvd2x 42, %5, %2 \n\t"
"stxvd2x 43, %6, %2 \n\t"
"stxvd2x 44, %7, %2 \n\t"
"stxvd2x 45, %8, %2 \n\t"
"stxvd2x 46, %9, %2 \n\t"
"stxvd2x 47, %10, %2 \n"
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:
@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
".p2align 5 \n"
"1: \n\t"
"stxvw4x %x3, 0, %2 \n\t"
"stxvw4x %x3, %4, %2 \n\t"
"stxvw4x %x3, %5, %2 \n\t"
"stxvw4x %x3, %6, %2 \n\t"
"stxvw4x %x3, %7, %2 \n\t"
"stxvw4x %x3, %8, %2 \n\t"
"stxvw4x %x3, %9, %2 \n\t"
"stxvw4x %x3, %10, %2 \n\t"
"stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t"
"stxvd2x %x3, %5, %2 \n\t"
"stxvd2x %x3, %6, %2 \n\t"
"stxvd2x %x3, %7, %2 \n\t"
"stxvd2x %x3, %8, %2 \n\t"
"stxvd2x %x3, %9, %2 \n\t"
"stxvd2x %x3, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"


@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
"lxvw4x 32, 0, %4 \n\t"
"lxvw4x 33, %5, %4 \n\t"
"lxvw4x 34, %6, %4 \n\t"
"lxvw4x 35, %7, %4 \n\t"
"lxvw4x 36, %8, %4 \n\t"
"lxvw4x 37, %9, %4 \n\t"
"lxvw4x 38, %10, %4 \n\t"
"lxvw4x 39, %11, %4 \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %4 \n\t"
"lxvd2x 35, %7, %4 \n\t"
"lxvd2x 36, %8, %4 \n\t"
"lxvd2x 37, %9, %4 \n\t"
"lxvd2x 38, %10, %4 \n\t"
"lxvd2x 39, %11, %4 \n\t"
"lxvw4x 40, 0, %3 \n\t"
"lxvw4x 41, %5, %3 \n\t"
"lxvw4x 42, %6, %3 \n\t"
"lxvw4x 43, %7, %3 \n\t"
"lxvw4x 44, %8, %3 \n\t"
"lxvw4x 45, %9, %3 \n\t"
"lxvw4x 46, %10, %3 \n\t"
"lxvw4x 47, %11, %3 \n\t"
"lxvd2x 40, 0, %3 \n\t"
"lxvd2x 41, %5, %3 \n\t"
"lxvd2x 42, %6, %3 \n\t"
"lxvd2x 43, %7, %3 \n\t"
"lxvd2x 44, %8, %3 \n\t"
"lxvd2x 45, %9, %3 \n\t"
"lxvd2x 46, %10, %3 \n\t"
"lxvd2x 47, %11, %3 \n\t"
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %4 \n\t"
"stxvw4x 41, %5, %4 \n\t"
"stxvw4x 42, %6, %4 \n\t"
"stxvw4x 43, %7, %4 \n\t"
"stxvw4x 44, %8, %4 \n\t"
"stxvw4x 45, %9, %4 \n\t"
"stxvw4x 46, %10, %4 \n\t"
"stxvw4x 47, %11, %4 \n\t"
"stxvd2x 40, 0, %4 \n\t"
"stxvd2x 41, %5, %4 \n\t"
"stxvd2x 42, %6, %4 \n\t"
"stxvd2x 43, %7, %4 \n\t"
"stxvd2x 44, %8, %4 \n\t"
"stxvd2x 45, %9, %4 \n\t"
"stxvd2x 46, %10, %4 \n\t"
"stxvd2x 47, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"

relapack/LICENSE (new file)

@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2016 Elmar Peise
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

relapack/Makefile (new file)

@ -0,0 +1,98 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
SRC = $(wildcard src/*.c)
SRC1 = \
src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \
src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \
src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \
src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c
SRC2 = \
src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \
src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \
src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \
src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \
src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \
src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \
src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \
src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \
src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c
SRCX = \
src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \
src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \
src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \
src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \
src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \
src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \
src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \
src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \
src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c
OBJS1 = $(SRC1:%.c=%.$(SUFFIX))
OBJS2 = $(SRC2:%.c=%.o)
OBJS = $(OBJS1) $(OBJS2)
TEST_SUITS = \
slauum dlauum clauum zlauum \
spotrf dpotrf cpotrf zpotrf \
spbtrf dpbtrf cpbtrf zpbtrf \
ssygst dsygst chegst zhegst \
ssytrf dsytrf csytrf chetrf zsytrf zhetrf \
sgetrf dgetrf cgetrf zgetrf \
sgbtrf dgbtrf cgbtrf zgbtrf \
strsyl dtrsyl ctrsyl ztrsyl \
stgsyl dtgsyl ctgsyl ztgsyl \
sgemmt dgemmt cgemmt zgemmt
TESTS = $(TEST_SUITS:%=test/%.pass) # dummies
TEST_EXES = $(TEST_SUITS:%=test/%.x)
LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm
.SECONDARY: $(TEST_EXES)
.PHONY: test
# ReLAPACK compilation
libs: $(OBJS)
@echo "Building ReLAPACK library $(LIBNAME)"
$(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS)
$(RANLIB) $(TOPDIR)/$(LIBNAME)
%.$(SUFFIX): %.c config.h
$(CC) $(CFLAGS) -c $< -o $@
%.o: %.c config.h
$(CC) $(CFLAGS) -c $< -o $@
# ReLAPACK testing
test: $(TEST_EXES) $(TESTS)
@echo "passed all tests"
test/%.pass: test/%.x
@echo -n $*:
@./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<)
test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
# cleaning up
clean:
rm -f $(OBJS) test/util.$(SUFFIX) test/*.x

relapack/README.md (new file)

@ -0,0 +1,68 @@
ReLAPACK
========
[![Build Status](https://travis-ci.org/HPAC/ReLAPACK.svg?branch=master)](https://travis-ci.org/HPAC/ReLAPACK)
[Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK)
ReLAPACK offers a collection of recursive algorithms for many of LAPACK's
compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK
integrates effortlessly into existing application codes. ReLAPACK's routines
not only outperform the reference LAPACK but also improve upon the performance
of tuned implementations, such as OpenBLAS and MKL.
Coverage
--------
For a detailed list of covered operations and an overview of operations to which
recursion is not efficiently applicable, see [coverage.md](coverage.md).
Installation
------------
To compile with the default configuration, simply run `make` to create the
library `librelapack.a`.
### Linking with MKL
Note that to link with MKL, you currently need to set the flag
`COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and
`ztrsyl`. For further configuration options see [config.md](config.md).
### Dependencies
ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked
kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized
and machine specific implementations of these libraries, which are commonly
provided by hardware vendors or available as open source (e.g.,
[OpenBLAS](http://www.openblas.net/)).
Testing
-------
ReLAPACK's test suite compares its routines numerically with LAPACK's
counterparts. To set up the tests (located in `test/`), you need to specify
link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then
`make test` runs the tests. For details on the performed tests, see
[test/README.md](test/README.md).
Examples
--------
Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the
covered routines applies directly to ReLAPACK. A few separate examples are
given in `examples/`. For details, see [examples/README.md](examples/README.md).
Citing
------
When referencing ReLAPACK, please cite the preprint of the paper
[Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763):
@article{relapack,
author = {Elmar Peise and Paolo Bientinesi},
title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection},
journal = {CoRR},
volume = {abs/1602.06763},
year = {2016},
url = {http://arxiv.org/abs/1602.06763},
}

relapack/config.h (new file)

@ -0,0 +1,208 @@
#ifndef RELAPACK_CONFIG_H
#define RELAPACK_CONFIG_H
// ReLAPACK configuration file.
// See also config.md
///////////////////////////////
// BLAS/LAPACK object symbols //
///////////////////////////////
// BLAS routines linked against have a trailing underscore
#define BLAS_UNDERSCORE 1
// LAPACK routines linked against have a trailing underscore
#define LAPACK_UNDERSCORE BLAS_UNDERSCORE
// Complex BLAS/LAPACK routines return their result in the first argument
// This option must be enabled when linking to MKL for ctrsyl and ztrsyl to
// work.
#define COMPLEX_FUNCTIONS_AS_ROUTINES 0
#ifdef F_INTERFACE_INTEL
#define COMPLEX_FUNCTIONS_AS_ROUTINES 1
#endif
#define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
#define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
// The BLAS-like extension xgemmt is provided by an external library.
#define HAVE_XGEMMT 0
////////////////////////////
// Use malloc in ReLAPACK //
////////////////////////////
#define ALLOW_MALLOC 1
// allow malloc in xsygst for improved performance
#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC
// allow malloc in xsytrf if the passed work buffer is too small
#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
////////////////////////////////
// LAPACK routine replacement //
////////////////////////////////
// The following macros specify which routines are included in the library under
// LAPACK's symbol names: 1 included, 0 not included
#define INCLUDE_ALL 1
#define INCLUDE_XLAUUM INCLUDE_ALL
#define INCLUDE_SLAUUM INCLUDE_XLAUUM
#define INCLUDE_DLAUUM INCLUDE_XLAUUM
#define INCLUDE_CLAUUM INCLUDE_XLAUUM
#define INCLUDE_ZLAUUM INCLUDE_XLAUUM
#define INCLUDE_XSYGST INCLUDE_ALL
#define INCLUDE_SSYGST INCLUDE_XSYGST
#define INCLUDE_DSYGST INCLUDE_XSYGST
#define INCLUDE_CHEGST INCLUDE_XSYGST
#define INCLUDE_ZHEGST INCLUDE_XSYGST
#define INCLUDE_XTRTRI INCLUDE_ALL
#define INCLUDE_STRTRI INCLUDE_XTRTRI
#define INCLUDE_DTRTRI INCLUDE_XTRTRI
#define INCLUDE_CTRTRI INCLUDE_XTRTRI
#define INCLUDE_ZTRTRI INCLUDE_XTRTRI
#define INCLUDE_XPOTRF INCLUDE_ALL
#define INCLUDE_SPOTRF INCLUDE_XPOTRF
#define INCLUDE_DPOTRF INCLUDE_XPOTRF
#define INCLUDE_CPOTRF INCLUDE_XPOTRF
#define INCLUDE_ZPOTRF INCLUDE_XPOTRF
#define INCLUDE_XPBTRF INCLUDE_ALL
#define INCLUDE_SPBTRF INCLUDE_XPBTRF
#define INCLUDE_DPBTRF INCLUDE_XPBTRF
#define INCLUDE_CPBTRF INCLUDE_XPBTRF
#define INCLUDE_ZPBTRF INCLUDE_XPBTRF
#define INCLUDE_XSYTRF INCLUDE_ALL
#define INCLUDE_SSYTRF INCLUDE_XSYTRF
#define INCLUDE_DSYTRF INCLUDE_XSYTRF
#define INCLUDE_CSYTRF INCLUDE_XSYTRF
#define INCLUDE_CHETRF INCLUDE_XSYTRF
#define INCLUDE_ZSYTRF INCLUDE_XSYTRF
#define INCLUDE_ZHETRF INCLUDE_XSYTRF
#define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF
#define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF
#define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF
#define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF
#define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF
#define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF
#define INCLUDE_XGETRF INCLUDE_ALL
#define INCLUDE_SGETRF INCLUDE_XGETRF
#define INCLUDE_DGETRF INCLUDE_XGETRF
#define INCLUDE_CGETRF INCLUDE_XGETRF
#define INCLUDE_ZGETRF INCLUDE_XGETRF
#define INCLUDE_XGBTRF INCLUDE_ALL
#define INCLUDE_SGBTRF INCLUDE_XGBTRF
#define INCLUDE_DGBTRF INCLUDE_XGBTRF
#define INCLUDE_CGBTRF INCLUDE_XGBTRF
#define INCLUDE_ZGBTRF INCLUDE_XGBTRF
#define INCLUDE_XTRSYL INCLUDE_ALL
#define INCLUDE_STRSYL INCLUDE_XTRSYL
#define INCLUDE_DTRSYL INCLUDE_XTRSYL
#define INCLUDE_CTRSYL INCLUDE_XTRSYL
#define INCLUDE_ZTRSYL INCLUDE_XTRSYL
#define INCLUDE_XTGSYL INCLUDE_ALL
#define INCLUDE_STGSYL INCLUDE_XTGSYL
#define INCLUDE_DTGSYL INCLUDE_XTGSYL
#define INCLUDE_CTGSYL INCLUDE_XTGSYL
#define INCLUDE_ZTGSYL INCLUDE_XTGSYL
#define INCLUDE_XGEMMT 0
#define INCLUDE_SGEMMT INCLUDE_XGEMMT
#define INCLUDE_DGEMMT INCLUDE_XGEMMT
#define INCLUDE_CGEMMT INCLUDE_XGEMMT
#define INCLUDE_ZGEMMT INCLUDE_XGEMMT
/////////////////////
// crossover sizes //
/////////////////////
// default crossover size
#define CROSSOVER 24
// individual crossover sizes
#define CROSSOVER_XLAUUM CROSSOVER
#define CROSSOVER_SLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_DLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_CLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_XSYGST CROSSOVER
#define CROSSOVER_SSYGST CROSSOVER_XSYGST
#define CROSSOVER_DSYGST CROSSOVER_XSYGST
#define CROSSOVER_CHEGST CROSSOVER_XSYGST
#define CROSSOVER_ZHEGST CROSSOVER_XSYGST
#define CROSSOVER_XTRTRI CROSSOVER
#define CROSSOVER_STRTRI CROSSOVER_XTRTRI
#define CROSSOVER_DTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_CTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_XPOTRF CROSSOVER
#define CROSSOVER_SPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_DPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_CPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_XPBTRF CROSSOVER
#define CROSSOVER_SPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_DPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_CPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_XSYTRF CROSSOVER
#define CROSSOVER_SSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_DSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_CSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_CHETRF CROSSOVER_XSYTRF
#define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_ZHETRF CROSSOVER_XSYTRF
#define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF
#define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF
#define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF
#define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF
#define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF
#define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF
#define CROSSOVER_XGETRF CROSSOVER
#define CROSSOVER_SGETRF CROSSOVER_XGETRF
#define CROSSOVER_DGETRF CROSSOVER_XGETRF
#define CROSSOVER_CGETRF CROSSOVER_XGETRF
#define CROSSOVER_ZGETRF CROSSOVER_XGETRF
#define CROSSOVER_XGBTRF CROSSOVER
#define CROSSOVER_SGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_DGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_CGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_XTRSYL CROSSOVER
#define CROSSOVER_STRSYL CROSSOVER_XTRSYL
#define CROSSOVER_DTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_CTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_XTGSYL CROSSOVER
#define CROSSOVER_STGSYL CROSSOVER_XTGSYL
#define CROSSOVER_DTGSYL CROSSOVER_XTGSYL
#define CROSSOVER_CTGSYL CROSSOVER_XTGSYL
#define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL
// sytrf helper routine
#define CROSSOVER_XGEMMT CROSSOVER_XSYTRF
#define CROSSOVER_SGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_DGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_CGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT
#endif /* RELAPACK_CONFIG_H */

relapack/config.md (new file)

@ -0,0 +1,87 @@
ReLAPACK Configuration
======================
ReLAPACK has two configuration files: `make.inc`, which is included by the
Makefile, and `config.h`, which is included in the source files.
Build and Testing Environment
-----------------------------
The build environment (compiler and flags) and the test configuration (linker
flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size
and error bounds are defined in `test/config.h`.
The library `librelapack.a` is compiled by invoking `make`. The tests are
performed by either `make test` or calling `make` in the test folder.
BLAS/LAPACK complex function interfaces
---------------------------------------
For BLAS and LAPACK functions that return a complex number, there exist two
conflicting (FORTRAN-compiler-dependent) calling conventions: either the result
is returned as a `struct` of two floating-point numbers, or an additional first
argument with a pointer to such a `struct` is used. By default ReLAPACK uses
the former (which is what gfortran uses), but it can switch to the latter by
setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK
specific counterparts) to `1` in `config.h`.
**For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.**
(Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases
will segfault or return errors on the order of 1 or larger.)
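As a sketch of the two conventions for a complex-valued BLAS function such as
`cdotc` (the C prototypes below are illustrative; the Fortran side is
unchanged):

    #include <complex.h>

    #if COMPLEX_FUNCTIONS_AS_ROUTINES
    /* the result is delivered through a hidden first argument */
    void cdotc_(float complex *result, const int *n,
                const float complex *x, const int *incx,
                const float complex *y, const int *incy);
    #else
    /* the result is returned by value (the gfortran convention) */
    float complex cdotc_(const int *n,
                         const float complex *x, const int *incx,
                         const float complex *y, const int *incy);
    #endif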
BLAS extension `xgemmt`
-----------------------
The LDL decompositions require a general matrix-matrix product, called `xgemmt`,
that updates only a triangular part of its result. If the BLAS implementation
linked against provides such a routine, set the flag `HAVE_XGEMMT` to `1` in
`config.h`; otherwise, ReLAPACK uses its own recursive implementation of this
kernel.
`xgemmt` is provided by MKL.
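A minimal reference sketch of the operation for the non-transposed,
lower-triangular, single-precision case (column-major storage; the helper name
is illustrative, not a library symbol):

    /* C(i,j) = alpha * A(i,:) * B(:,j) + beta * C(i,j), for i >= j only */
    static void sgemmt_lower_nn(int n, int k, float alpha,
                                const float *A, int ldA,
                                const float *B, int ldB,
                                float beta, float *C, int ldC)
    {
        for (int j = 0; j < n; j++)
            for (int i = j; i < n; i++) {        /* lower triangle only */
                float s = 0.0f;
                for (int l = 0; l < k; l++)
                    s += A[i + l * ldA] * B[l + j * ldB];
                C[i + j * ldC] = alpha * s + beta * C[i + j * ldC];
            }
    }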
Routine Selection
-----------------
ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the
corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to
`1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g.,
`dgetrf_`). By default, wrappers for all routines are enabled.
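For example, assuming standard LAPACK argument passing (exact integer types
may differ by build; this is only a sketch):

    extern void RELAPACK_dgetrf(const int *m, const int *n, double *A,
                                const int *ldA, int *ipiv, int *info);
    extern void dgetrf_(const int *m, const int *n, double *A,
                        const int *ldA, int *ipiv, int *info);

    void lu_factor(int n, double *A, int *ipiv)
    {
        int info;
        RELAPACK_dgetrf(&n, &n, A, &n, ipiv, &info); /* always exported */
        /* dgetrf_(&n, &n, A, &n, ipiv, &info);  the LAPACK-named wrapper,
           available only if INCLUDE_DGETRF == 1 */
    }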
Crossover Size
--------------
The crossover size determines below which matrix sizes ReLAPACK's recursive
algorithms switch to LAPACK's unblocked routines in order to avoid tiny BLAS
Level 3 calls. The crossover size is set in `config.h` and can be chosen
either globally for the entire library, per operation, or individually per
routine, as sketched below.
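For example, to recurse further down only for the LU decompositions while
keeping the global default, the corresponding lines in `config.h` could read
(the values are illustrative):

    #define CROSSOVER 24                        /* global default */
    #define CROSSOVER_XGETRF 16                 /* all getrf variants */
    #define CROSSOVER_DGETRF CROSSOVER_XGETRF   /* or an individual value */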
Allowing Temporary Buffers
--------------------------
Two of ReLAPACK's routines make use of temporary buffers, which are allocated
and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the
routine-specific counterparts) to 0 in `config.h` will disable these buffers. The
affected routines are:
* `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in
LAPACK, this size can be queried by setting `lWork = -1`; the passed buffer
is used if it is large enough, and a local buffer is allocated only if it is
not (see the query sketch after this list).
The advantage of this mechanism is that ReLAPACK works seamlessly even with
codes that statically provide too little workspace, instead of breaking them.
* `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem
to standard form can use an auxiliary buffer of size n^2 / 2 to avoid
redundant computations. It thereby performs about 30% fewer FLOPs than
LAPACK.
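A sketch of the `lWork = -1` workspace query described above, using LAPACK's
standard `dsytrf` interface (exact integer types depend on the build):

    #include <stdlib.h>

    extern void dsytrf_(const char *uplo, const int *n, double *A,
                        const int *ldA, int *ipiv, double *work,
                        const int *lWork, int *info);

    void factor_ldl(int n, double *A, int *ipiv)
    {
        int info, lWork = -1;
        double query;
        dsytrf_("L", &n, A, &n, ipiv, &query, &lWork, &info); /* size query */
        lWork = (int) query;
        double *work = malloc(lWork * sizeof(double));
        dsytrf_("L", &n, A, &n, ipiv, work, &lWork, &info);   /* factorize */
        free(work);
    }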
FORTRAN symbol names
--------------------
ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN interfaces.
Since these libraries usually append a trailing underscore to their symbol
names, ReLAPACK has configuration switches in `config.h` to adjust the
corresponding routine names.
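A minimal sketch of such a name-mangling switch (the macro name `BLAS` here is
illustrative, not ReLAPACK's actual internal macro):

    #if BLAS_UNDERSCORE
    #define BLAS(routine) routine##_   /* dgemm -> dgemm_ */
    #else
    #define BLAS(routine) routine      /* dgemm -> dgemm  */
    #endif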

relapack/coverage.md (new file)

@ -0,0 +1,212 @@
Coverage of ReLAPACK
====================
This file lists all LAPACK compute routines that are covered by recursive
algorithms in ReLAPACK; it also lists all of LAPACK's blocked algorithms that
are not (yet) part of ReLAPACK.
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
- [List of covered LAPACK routines](#list-of-covered-lapack-routines)
- [`xlauum`](#xlauum)
- [`xsygst`](#xsygst)
- [`xtrtri`](#xtrtri)
- [`xpotrf`](#xpotrf)
- [`xpbtrf`](#xpbtrf)
- [`xsytrf`](#xsytrf)
- [`xgetrf`](#xgetrf)
- [`xgbtrf`](#xgbtrf)
- [`xtrsyl`](#xtrsyl)
- [`xtgsyl`](#xtgsyl)
- [Covered BLAS extension](#covered-blas-extension)
- [`xgemmt`](#xgemmt)
- [Not covered yet](#not-covered-yet)
- [`xpstrf`](#xpstrf)
- [Not covered: extra FLOPs](#not-covered-extra-flops)
- [QR decomposition (and related)](#qr-decomposition-and-related)
- [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal)
- [Reduction to bidiagonal](#reduction-to-bidiagonal)
- [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
List of covered LAPACK routines
-------------------------------
### `xlauum`
Multiplication of a triangular matrix with its (complex conjugate) transpose,
resulting in a symmetric (Hermitian) matrix.
Routines: `slauum`, `dlauum`, `clauum`, `zlauum`
Operations:
* A = L^T L
* A = U U^T
### `xsygst`
Simultaneous two-sided multiplication of a symmetric matrix with a triangular
matrix and its transpose
Routines: `ssygst`, `dsygst`, `chegst`, `zhegst`
Operations:
* A = inv(L) A inv(L^T)
* A = inv(U^T) A inv(U)
* A = L^T A L
* A = U A U^T
### `xtrtri`
Inversion of a triangular matrix
Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri`
Operations:
* L = inv(L)
* U = inv(U)
### `xpotrf`
Cholesky decomposition of a symmetric (Hermitian) positive definite matrix
Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf`
Operations:
* L L^T = A
* U^T U = A
### `xpbtrf`
Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix
Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf`
Operations:
* L L^T = A
* U^T U = A
### `xsytrf`
LDL decomposition of a symmetric (or Hermitian) matrix
Routines:
* `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`,
* `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`,
`zhetrf_rook`
Operations:
* L D L^T = A
* U D U^T = A
### `xgetrf`
LU decomposition of a general matrix with pivoting
Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf`
Operation: P L U = A
### `xgbtrf`
LU decomposition of a general banded matrix with pivoting
Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf`
Operation: L U = A
### `xtrsyl`
Solution of the quasi-triangular Sylvester equation
Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl`
Operations:
* A X + X B = C -> X
* A^T X + X B = C -> X
* A X + X B^T = C -> X
* A^T X + X B^T = C -> X
* A X - X B = C -> X
* A^T X - X B = C -> X
* A X - X B^T = C -> X
* A^T X - X B^T = C -> X
### `xtgsyl`
Solution of the generalized Sylvester equations
Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl`
Operations:
* A R - L B = C, D R - L E = F -> L, R
* A^T R + D^T L = C, R B^T - L E^T = -F -> L, R
Covered BLAS extension
----------------------
### `xgemmt`
Matrix-matrix product updating only a triangular part of the result
Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt`
Operations:
* C = alpha A B + beta C
* C = alpha A B^T + beta C
* C = alpha A^T B + beta C
* C = alpha A^T B^T + beta C
Not covered yet
---------------
The following operation is implemented as a blocked algorithm in LAPACK but is
not yet covered by a recursive algorithm in ReLAPACK.
### `xpstrf`
Cholesky decomposition of a positive semi-definite matrix with complete pivoting.
Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf`
Operations:
* P L L^T P^T = A
* P U^T U P^T = A
Not covered: extra FLOPs
------------------------
The following routines are not covered because recursive variants would require
considerably more FLOPs or operate on banded matrices.
### QR decomposition (and related)
Routines:
* `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf`
* `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf`
* `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf`
* `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf`
* `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf`
Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A
Routines for multiplication with Q:
* `sormqr`, `dormqr`, `cunmqr`, `zunmqr`
* `sormrq`, `dormrq`, `cunmrq`, `zunmrq`
* `sormql`, `dormql`, `cunmql`, `zunmql`
* `sormlq`, `dormlq`, `cunmlq`, `zunmlq`
* `sormrz`, `dormrz`, `cunmrz`, `zunmrz`
Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T
Routines for construction of Q:
* `sorgqr`, `dorgqr`, `cungqr`, `zungqr`
* `sorgrq`, `dorgrq`, `cungrq`, `zungrq`
* `sorgql`, `dorgql`, `cungql`, `zungql`
* `sorglq`, `dorglq`, `cunglq`, `zunglq`
### Symmetric reduction to tridiagonal
Routines: `ssytrd`, `dsytrd`, `chetrd`, `zhetrd`
Operation: Q T Q^T = A
### Reduction to bidiagonal
Routines: `sgebrd`, `dgebrd`, `cgebrd`, `zgebrd`
Operation: Q B P^T = A
### Reduction to upper Hessenberg
Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd`
Operation: Q H Q^T = A

67
relapack/inc/relapack.h Normal file
View File

@ -0,0 +1,67 @@
#ifndef RELAPACK_H
#define RELAPACK_H
void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);
void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);
void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *);
void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
#endif /* RELAPACK_H */

61
relapack/src/blas.h Normal file
View File

@ -0,0 +1,61 @@
#ifndef BLAS_H
#define BLAS_H
extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *);
extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *);
extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *);
extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *);
extern void BLAS(sscal)(const int *, const float *, float *, const int *);
extern void BLAS(dscal)(const int *, const double *, double *, const int *);
extern void BLAS(cscal)(const int *, const float *, float *, const int *);
extern void BLAS(zscal)(const int *, const double *, double *, const int *);
extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
#if HAVE_XGEMMT
extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
#endif
#endif /* BLAS_H */

230
relapack/src/cgbtrf.c Normal file
View File

@ -0,0 +1,230 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *,
const int *, float *, const int *, int *, float *, const int *, float *,
const int *, int *);
/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's cgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html
* */
void RELAPACK_cgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CGBTRF", &minfo);
return;
}
// Constant
const float ZERO[] = { 0., 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * kv;
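// (LAPACK band storage places A(i,j) at Ab[kv + i - j + *ldAb * j]; with the
// leading dimension shortened by 1, the i-j skew cancels and the in-band
// entries become addressable as an ordinary array A[i + *ldA * j]. Complex
// entries occupy two floats, hence the factors of 2 throughout.)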
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
float *const A_j = A + 2 * *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[2 * i] = A_j[2 * i + 1] = 0.;
}
// Allocate work space
const int n1 = CREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float));
float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float));
LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** cgbtrf's recursive compute kernel */
static void RELAPACK_cgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_CGBTRF, 1)) {
// Unblocked
LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * kv;
// Splitting
const int n1 = MIN(CREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
float *const Ab_L = Ab;
float *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * m1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
float *const A_Rl = A_R;
float *const A_Rr = A_R + 2 * *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + 2 * *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + 2 * m21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + 2 * *ldA * n21;
float *const A_BRbl = A_BR + 2 * m21;
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
float *const A_Rrj = A_Rr + 2 * *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const float tmpr = A_Rrj[2 * i];
const float tmpc = A_Rrj[2 * i + 1];
A_Rrj[2 * i] = A_Rrj[2 * ip];
A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
A_Rrj[2 * ip] = tmpr;
A_Rrj[2 * ip + 1] = tmpc;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}

167
relapack/src/cgemmt.c Normal file
View File

@ -0,0 +1,167 @@
#include "relapack.h"
static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
/** CGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
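/* Usage sketch (all variables illustrative): update only the lower triangle
 * of C with C := alpha*A*B + beta*C:
 *   RELAPACK_cgemmt("L", "N", "N", &n, &k, alpha, A, &ldA, B, &ldB, beta, C, &ldC);
 * */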
void RELAPACK_cgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int ctransA = LAPACK(lsame)(transA, "C");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
const int ctransB = LAPACK(lsame)(transB, "C");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !ctransA && !notransA)
info = 2;
else if (!tranB && !ctransB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("CGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');
// Recursive kernel
RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** cgemmt's recursive compute kernel */
static void RELAPACK_cgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_CGEMMT, 1)) {
// Unblocked
RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const float *const A_T = A;
const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const float *const B_L = B;
const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
float *const C_TL = C;
float *const C_TR = C + 2 * *ldC * n1;
float *const C_BL = C + 2 * n1;
float *const C_BR = C + 2 * *ldC * n1 + 2 * n1;
// recursion(C_TL)
RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** cgemmt's unblocked compute kernel */
static void RELAPACK_cgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const float *const A_0 = A;
const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
float *const C_0i = C + 2 * *ldC * i;
float *const C_ii = C + 2 * *ldC * i + 2 * i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}

117
relapack/src/cgetrf.c Normal file
View File

@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
const int *, int *, int *);
/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's cgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
*/
void RELAPACK_cgetrf(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const float ONE[] = { 1., 0. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const float *const A_L = A;
float *const A_R = A + 2 * *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** cgetrf's recursive compute kernel */
static void RELAPACK_cgetrf_rec(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
// Unblocked
LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}

212
relapack/src/chegst.c Normal file
View File

@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_chegst_rec(const int *, const char *, const int *,
float *, const int *, const float *, const int *,
float *, const int *, int *);
/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's chegst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html
* */
void RELAPACK_chegst(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHEGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
float *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = CREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * 2 * sizeof(float));
if (!Work)
lWork = 0;
#endif
// recursive kernel
RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** chegst's recursive compute kernel */
static void RELAPACK_chegst_rec(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
float *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_CHEGST, 1)) {
// Unblocked
LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const float ZERO[] = { 0., 0. };
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const float HALF[] = { .5, 0. };
const float MHALF[] = { -.5, 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// B_TL B_TR
// B_BL B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BL = B + 2 * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// recursion(A_TL, B_TL)
RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}

236
relapack/src/chetrf.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's chetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
* */
void RELAPACK_chetrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** chetrf's recursive compute kernel */
static void RELAPACK_chetrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
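    // Note: n requests a panel width; n_out reports how many columns were
    // actually factored. A 2x2 Bunch-Kaufman pivot may not straddle the
    // recursion boundary, so a recursive call can fall one column short
    // (compensated by the "undo 1 column of updates" blocks below).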
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

520
relapack/src/chetrf_rec2.c Normal file
View File

@ -0,0 +1,520 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
*
* This routine is a minor modification of LAPACK's clahef.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int *
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2, r__3, r__4;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k;
static float t, r1;
static complex d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen), ccopy_(int *, complex *, int *,
complex *, int *), cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float absakk;
extern /* Subroutine */ int clacgv_(int *, complex *, int *);
extern int icamax_(int *, complex *, int *);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
i__1 = k - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - 1;
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (kw - 1) * w_dim1]), dabs(r__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
r1 = 1.f / a[i__1].r;
i__1 = k - 1;
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + kw * w_dim1], &q__2);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
q__2.r = t, q__2.i = 0.f;
c_div(&q__1, &q__2, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
r_cnjg(&q__2, &d21);
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
q__2.r * q__3.i + q__2.i * q__3.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (imax < *n) {
i__1 = *n - imax;
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (k + 1) * w_dim1]), dabs(r__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
r1 = 1.f / a[i__1].r;
i__1 = *n - k;
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + k * w_dim1], &q__2);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
q__2.r = t, q__2.i = 0.f;
c_div(&q__1, &q__2, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
r_cnjg(&q__2, &d21);
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
q__2.r * q__3.i + q__2.i * q__3.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
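The kernel above records its pivoting decisions in ipiv: a positive entry marks a 1x1 diagonal pivot block (ipiv[k] = kp gives the row interchanged with k), while a 2x2 block is marked by negative entries in two consecutive positions (here ipiv[k] = ipiv[k+1] = -kp; the rook variants further down store two distinct indices, -p and -kp). A minimal decoding sketch, assuming 0-based C indexing; the helper name is hypothetical, not part of this commit:

#include <stdio.h>

// Count 1x1 and 2x2 pivot blocks in a Bunch-Kaufman pivot vector.
static void count_pivot_blocks(const int *ipiv, int n) {
    int k = 0, n1x1 = 0, n2x2 = 0;
    while (k < n) {
        if (ipiv[k] > 0) {
            ++n1x1;          // 1x1 pivot block
            k += 1;
        } else {
            ++n2x2;          // 2x2 pivot block spans columns k and k+1
            k += 2;
        }
    }
    printf("%d 1x1 and %d 2x2 pivot blocks\n", n1x1, n2x2);
}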

236
relapack/src/chetrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's chetrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html
* */
void RELAPACK_chetrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** chetrf_rook's recursive compute kernel */
static void RELAPACK_chetrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
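A minimal calling sketch for the wrapper above, under the conventions visible in its argument checks: complex entries are passed as interleaved float pairs, and lWork = -1 performs a workspace-size query that leaves the required length (in complex elements) in Work[0]. The helper name and error handling are illustrative assumptions, not part of this commit.

#include <stdlib.h>

void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *,
                          int *, float *, const int *, int *);

// Factor a Hermitian n x n matrix A (column-major, interleaved re/im floats).
static int factor_hermitian(int n, float *A, int *ipiv) {
    int info, lWork = -1;
    float query[2];
    // Workspace query: the required size is returned in query[0].
    RELAPACK_chetrf_rook("L", &n, A, &n, ipiv, query, &lWork, &info);
    lWork = (int)query[0];
    float *Work = malloc((size_t)lWork * 2 * sizeof(float));  // complex workspace
    if (!Work) return -1;
    RELAPACK_chetrf_rook("L", &n, A, &n, ipiv, Work, &lWork, &info);
    free(Work);
    return info;
}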

661
relapack/src/chetrf_rook_rec2.c Normal file
View File

@ -0,0 +1,661 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
*
* This routine is a minor modification of LAPACK's clahef_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, complex *a, int *lda, int *ipiv,
complex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4, q__5;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k, p;
static float t, r1;
static complex d11, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen);
static float sfmin;
extern /* Subroutine */ int ccopy_(int *, complex *, int *,
complex *, int *);
static int itemp;
extern /* Subroutine */ int cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float stemp, absakk;
extern /* Subroutine */ int clacgv_(int *, complex *, int *);
extern int icamax_(int *, complex *, int *);
extern double slamch_(char *, ftnlen);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
if (k > 1) {
i__1 = k - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
c__1);
}
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
if (k > 1) {
i__1 = k - 1;
ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
&c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
if (imax > 1) {
i__1 = imax - 1;
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
}
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = k - 1 - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
i__1 = k - 1 - p;
clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
if (p > 1) {
i__1 = p - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (dabs(t) >= sfmin) {
r1 = 1.f / t;
i__1 = k - 1;
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L14: */
}
}
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + kw * w_dim1], &q__2);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
r_cnjg(&q__5, &d21);
c_div(&q__2, &q__3, &q__5);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
++jj;
if (kstep == 2 && jp1 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
a_dim1], &c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (imax < *n) {
i__1 = *n - imax;
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (k + 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (k + 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = p - k - 1;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
a_dim1], lda);
i__1 = p - k - 1;
clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
if (p < *n) {
i__1 = *n - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
* a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
}
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (dabs(t) >= sfmin) {
r1 = 1.f / t;
i__1 = *n - k;
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L74: */
}
}
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + k * w_dim1], &q__2);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
r_cnjg(&q__5, &d21);
c_div(&q__2, &q__3, &q__5);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
--jj;
if (kstep == 2 && jp1 != jj && j >= 1) {
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
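The L12/L72 search loops above are the "rook" part of the pivoting: starting from the row imax of the column's largest off-diagonal entry, the search keeps hopping to the row of the current candidate column's largest entry until one of two acceptance tests fires, which bounds element growth from both the row and the column side. A real-valued sketch of the stopping rule, with a hypothetical helper name; alpha = (1 + sqrt(17))/8 is the classical Bunch-Kaufman constant:

#include <math.h>

// Returns nonzero when the rook pivot search may stop at row imax.
static int rook_search_done(double diag_imax, double rowmax, double colmax,
                            int p, int jmax) {
    const double alpha = (sqrt(17.0) + 1.0) / 8.0;
    if (fabs(diag_imax) >= alpha * rowmax)
        return 1;    // accept a 1x1 pivot at imax
    if (p == jmax || rowmax <= colmax)
        return 1;    // accept a 2x2 pivot on rows (p, imax)
    return 0;        // otherwise continue: p = imax, imax = jmax
}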

87
relapack/src/clauum.c Normal file
View File

@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_clauum_rec(const char *, const int *, float *,
const int *, int *);
/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's clauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html
* */
void RELAPACK_clauum(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info);
}
/** clauum's recursive compute kernel */
static void RELAPACK_clauum_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_CLAUUM, 1)) {
// Unblocked
LAPACK(clauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info);
}
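The two updates in each branch are the block algebra of the triangular product spelled out in the comments. For uplo = 'L', partitioning L conformally with A gives (a short derivation, restated from the code above):

L^H L = \begin{pmatrix} L_{TL}^H & L_{BL}^H \\ 0 & L_{BR}^H \end{pmatrix}
        \begin{pmatrix} L_{TL} & 0 \\ L_{BL} & L_{BR} \end{pmatrix}
      = \begin{pmatrix} L_{TL}^H L_{TL} + L_{BL}^H L_{BL} & L_{BL}^H L_{BR} \\
                        L_{BR}^H L_{BL} & L_{BR}^H L_{BR} \end{pmatrix}

so after the first recursion forms L_{TL}^H L_{TL} in place, cherk adds L_{BL}^H L_{BL} to A_TL, ctrmm overwrites A_BL with L_{BR}^H L_{BL}, and the second recursion produces L_{BR}^H L_{BR}. The uplo = 'U' branch is the mirror image for U U^H.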

157
relapack/src/cpbtrf.c Normal file
View File

@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *,
float *, const int *, float *, const int *, int *);
/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's cpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html
* */
void RELAPACK_cpbtrf(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const float ZERO[] = { 0., 0. };
// Allocate work space
const int n1 = CREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
float *Work = malloc(mWork * nWork * 2 * sizeof(float));
LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** cpbtrf's recursive compute kernel */
static void RELAPACK_cpbtrf_rec(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
float *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_CPBTRF, 1)) {
// Unblocked
LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(CREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
float *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, *kd);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + 2 * *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + 2 * n21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + 2 * *ldA * n21;
float *const A_BRbl = A_BR + 2 * n21;
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
if (*uplo == 'L') {
// A_BLt = A_BLt / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}
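The "Unskew A" step above is the trick that lets dense kernels run on band storage: in the lower case, LAPACK stores entry A(i,j) of the band at Ab(1+i-j, j), and viewing the same buffer with leading dimension ldAb - 1 places A(i,j) at the ordinary dense position (i,j); the upper case is identical after offsetting the base pointer by kd rows. A small self-check of that index identity in complex-element units (hypothetical helper; i, j are 1-based as in the Fortran convention):

#include <assert.h>
#include <stddef.h>

// Verify: band offset of Ab(1+i-j, j) == dense offset of A(i,j) with ld = ldAb-1.
static void check_unskew(int n, int kd, int ldAb) {
    for (int j = 1; j <= n; ++j)
        for (int i = j; i <= j + kd && i <= n; ++i) {
            size_t band  = (size_t)(j - 1) * ldAb + (i - j);
            size_t dense = (size_t)(j - 1) * (ldAb - 1) + (i - 1);
            assert(band == dense);
        }
}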

92
relapack/src/cpotrf.c Normal file
View File

@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_cpotrf_rec(const char *, const int *, float *,
const int *, int *);
/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's cpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html
* */
void RELAPACK_cpotrf(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** cpotrf's recursive compute kernel */
static void RELAPACK_cpotrf_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_CPOTRF, 1)) {
// Unblocked
LAPACK(cpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
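The two branches restate the block Cholesky equations: for uplo = 'L', A = L L^H splits into A_BL = L_BL L_TL^H (the ctrsm) and A_BR - L_BL L_BL^H = L_BR L_BR^H (the cherk plus the second recursion), and info is shifted by n1 so it keeps naming the order of the offending leading minor. A minimal calling sketch, assuming the interleaved float-pair layout used throughout these files (the helper name is hypothetical):

void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);

// Cholesky-factor a Hermitian positive definite matrix in place.
// A holds 2*n*n floats: column-major complex, interleaved re/im.
static int chol_lower(int n, float *A) {
    int info;
    RELAPACK_cpotrf("L", &n, A, &n, &info);
    return info;  // 0 on success; k > 0 if the leading minor of order k is not positive definite
}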

238
relapack/src/csytrf.c Normal file
View File

@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's csytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html
* */
void RELAPACK_csytrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** csytrf's recursive compute kernel */
static void RELAPACK_csytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
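Two bookkeeping details above are easy to miss. First, a recursive call may retire fewer columns than requested (n1_out < n1 or n2_out < n2) when a 2x2 pivot would straddle the panel boundary; the cgemv then rolls back the one column of trailing updates that was applied prematurely. Second, pivots computed for the trailing block index rows of A_BR and must be shifted to global row numbers while preserving their sign, which encodes the 1x1/2x2 block structure. A standalone equivalent of that shift loop (hypothetical helper name):

// Offset block-local pivot indices by the block's starting row,
// preserving the sign that distinguishes 1x1 from 2x2 pivot blocks.
static void shift_pivots(int *ipiv, int n, int offset) {
    int i;
    for (i = 0; i < n; i++)
        ipiv[i] += (ipiv[i] > 0) ? offset : -offset;
}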

451
relapack/src/csytrf_rec2.c Normal file
View File

@ -0,0 +1,451 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's clasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2, r__3, r__4;
complex q__1, q__2, q__3;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k;
static complex t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern /* Subroutine */ int cscal_(int *, complex *, complex *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen), ccopy_(int *, complex *, int *,
complex *, int *), cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float absakk;
extern int icamax_(int *, complex *, int *);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
w_dim1]), dabs(r__2));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (kw - 1) * w_dim1]), dabs(r__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha *
rowmax) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = k - 1;
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + kw * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
c_div(&q__1, &t, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
w_dim1]), dabs(r__2));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (k + 1) * w_dim1]), dabs(r__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha *
rowmax) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = *n - k;
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k + k * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
c_div(&q__1, &t, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
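The d11/d21/d22/t arithmetic in the kstep == 2 branches above is an inline solve against the 2x2 pivot block, arranged so the determinant is never formed explicitly. Writing w11 and w22 for the block's diagonal workspace entries and d for its off-diagonal one:

D = \begin{pmatrix} w_{11} & d \\ d & w_{22} \end{pmatrix}, \qquad
D^{-1} = \frac{1}{w_{11} w_{22} - d^2}
         \begin{pmatrix} w_{22} & -d \\ -d & w_{11} \end{pmatrix}

and with d_{11} = w_{22}/d, d_{22} = w_{11}/d and t = 1/(d_{11} d_{22} - 1), the solution of D x = b is

x_1 = \frac{t}{d} (d_{11} b_1 - b_2), \qquad
x_2 = \frac{t}{d} (d_{22} b_2 - b_1),

which is exactly what the j-loops compute column by column; the Hermitian variants conjugate d where required.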

236
relapack/src/csytrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's csytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html
* */
void RELAPACK_csytrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** csytrf_rook's recursive compute kernel */
static void RELAPACK_csytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
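Because the routine above is functionally equivalent to LAPACK's csytrf_rook, its output feeds directly into the matching LAPACK solver. A sketch, assuming an f2c-style binding for LAPACK's csytrs_rook; the prototype and helper below are illustrative assumptions, not part of this commit:

extern void csytrs_rook_(const char *uplo, const int *n, const int *nrhs,
                         const float *A, const int *ldA, const int *ipiv,
                         float *B, const int *ldB, int *info);

// Solve A * X = B using the factorization and pivots produced by
// RELAPACK_csytrf_rook (A and ipiv are used exactly as csytrf_rook leaves them).
static int solve_after_factor(int n, int nrhs, const float *A,
                              const int *ipiv, float *B) {
    int info;
    csytrs_rook_("L", &n, &nrhs, A, &n, ipiv, B, &n, &info);
    return info;
}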

565
relapack/src/csytrf_rook_rec2.c Normal file
View File

@ -0,0 +1,565 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's clasyf_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
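As in the other _rec2 kernels, the uplo = 'U' loop below stages the working panel in the trailing columns of the workspace: column k of A maps to column kw = nb + k - n of W (and the lower pivot column of a 2x2 block to kkw = nb + kk - n). A tiny self-check of that mapping under the stated loop range (hypothetical helper):

#include <assert.h>

// For the processed columns k = n-nb+1 .. n, kw stays within 1 .. nb,
// so the nb panel columns fill W's columns 1 .. nb, with A's column n
// landing in W's last column.
static void check_panel_mapping(int n, int nb) {
    int k;
    for (k = n - nb + 1; k <= n; ++k)
        assert(1 <= nb + k - n && nb + k - n <= nb);
}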
/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, complex *a, int *lda, int *ipiv,
complex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k, p;
static complex t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern /* Subroutine */ int cscal_(int *, complex *, complex *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen);
static float sfmin;
extern /* Subroutine */ int ccopy_(int *, complex *, int *,
complex *, int *);
static int itemp;
extern /* Subroutine */ int cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float stemp, absakk;
extern int icamax_(int *, complex *, int *);
extern double slamch_(char *, ftnlen);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
w_dim1]), dabs(r__2));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha *
rowmax)) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = k - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ k * a_dim1]), dabs(r__2)) >= sfmin) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = k - 1;
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L14: */
}
}
}
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d12.r = w[i__1].r, d12.i = w[i__1].i;
c_div(&q__1, &w[k + kw * w_dim1], &d12);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d12);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d12);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
w_dim1]), dabs(r__2));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (k + 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (k + 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha *
rowmax)) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - k - 1;
ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ k * a_dim1]), dabs(r__2)) >= sfmin) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = *n - k;
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L74: */
}
}
}
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k + k * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
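
The constant alpha = (1 + sqrt(17))/8 computed near the top of this kernel is the classical Bunch-Kaufman threshold: a 1-by-1 diagonal pivot is accepted whenever |a_kk| >= alpha * colmax, and otherwise the rook search (the L12/L72 loops above) looks for a better candidate. A minimal sketch of just that acceptance test, on made-up magnitudes:

#include <math.h>
#include <stdio.h>

int main(void) {
    const float alpha = (sqrtf(17.f) + 1.f) / 8.f;  /* ~0.6404, bounds element growth */
    const float absakk = 0.9f;   /* hypothetical |a_kk| */
    const float colmax = 1.0f;   /* hypothetical largest off-diagonal magnitude in column k */
    if (!(absakk < alpha * colmax))
        printf("1x1 pivot accepted\n");   /* taken here: 0.9 >= 0.6404 */
    else
        printf("enter rook pivot search\n");
    return 0;
}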

268
relapack/src/ctgsyl.c Normal file
View File

@ -0,0 +1,268 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *,
const int *, const float *, const int *, const float *, const int *,
float *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, float *, float *, int *);
/** CTGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's ctgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
* */
void RELAPACK_ctgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "C");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'C';
// Constant
const float ZERO[] = { 0., 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
float scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
float dscale = 0;
float dsum = 1;
RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** ctgsyl's recursive compute kernel */
static void RELAPACK_ctgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dsum, float *dscale,
int *info
) {
if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
// Unblocked
LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1., 0. };
float scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = CREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + 2 * *ldA * m1;
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + 2 * m1;
// D_TL D_TR
// 0 D_BR
const float *const D_TL = D;
const float *const D_TR = D + 2 * *ldD * m1;
const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1;
// F_T
// F_B
float *const F_T = F;
float *const F_B = F + 2 * m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
// C_T = C_T - A_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + 2 * *ldC * n1;
// E_TL E_TR
// 0 E_BR
const float *const E_TL = E;
const float *const E_TR = E + 2 * *ldE * n1;
const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1;
// F_L F_R
float *const F_L = F;
float *const F_R = F + 2 * *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
// C_R = C_R + F_L * B_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
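
The lWork == -1 branch above implements the usual LAPACK workspace-query convention: the routine stores the minimal workspace size in Work[0] and computes nothing else. A sketch of the resulting two-call pattern, assuming this translation unit is linked against the library (the matrix contents are irrelevant to the query itself):

#include <stdio.h>
#include <stdlib.h>

void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *,
    const float *, const int *, const float *, const int *,
    float *, const int *,
    const float *, const int *, const float *, const int *,
    float *, const int *,
    float *, float *,
    float *, const int *, int *, int *);

int main(void) {
    const int m = 2, n = 2, ijob = 1, ld = 2;
    float A[8] = {0}, B[8] = {0}, C[8] = {0}, D[8] = {0}, E[8] = {0}, F[8] = {0};
    float scale, dif, wkopt[2];
    int iWork[8], info, lWork = -1;      /* -1 requests a workspace query */
    RELAPACK_ctgsyl("N", &ijob, &m, &n, A, &ld, B, &ld, C, &ld,
                    D, &ld, E, &ld, F, &ld, &scale, &dif,
                    wkopt, &lWork, iWork, &info);
    const int lwmin = (int)wkopt[0];     /* here MAX(1, 2*m*n) = 8 complex entries */
    float *Work = malloc(2 * lwmin * sizeof(float));  /* 2 floats per complex entry */
    printf("info = %d, required workspace = %d\n", info, lwmin);
    /* a second call would pass Work and an lWork >= lwmin to actually solve */
    free(Work);
    return 0;
}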

163
relapack/src/ctrsyl.c Normal file
View File

@ -0,0 +1,163 @@
#include "relapack.h"
static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *,
const int *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, int *);
/** CTRSYL solves the complex Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's ctrsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html
* */
void RELAPACK_ctrsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!ctransA && !notransA)
*info = -1;
else if (!ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : 'C';
const char cleantranB = notransB ? 'N' : 'C';
// Recursive kernel
RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** ctrsyl's recursive compute kernel */
static void RELAPACK_ctrsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) {
// Unblocked
RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const float MSGN[] = { -*isgn, 0. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1., 0. };
float scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = CREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + 2 * *ldA * m1;
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + 2 * m1;
if (*tranA == 'N') {
// recursion(A_BR, B, C_B)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recursion(A_TL, B, C_T)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recursion(A_TL, B, C_T)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recursion(A_BR, B, C_B)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
}
} else {
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + 2 * *ldC * n1;
if (*tranB == 'N') {
// recursion(A, B_TL, C_L)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recursion(A, B_BR, C_R)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recursion(A, B_BR, C_R)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recursion(A, B_TL, C_L)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
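
At the bottom of this recursion the Sylvester equation op(A)*X + isgn*X*op(B) = scale*C shrinks to 1-by-1 operands, where it reduces to the scalar equation (a + isgn*b)*x = c; the unblocked kernel solves exactly this, with scaling guards against overflow. An illustrative real-valued instance of the base case:

#include <stdio.h>

int main(void) {
    const double a = 2., b = 3., c = 10.;   /* 1x1 "matrices" */
    const int isgn = 1;
    const double x = c / (a + isgn * b);    /* well-conditioned case: no scaling needed */
    printf("x = %g, residual = %g\n", x, a * x + isgn * x * b - c);  /* x = 2, residual 0 */
    return 0;
}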

392
relapack/src/ctrsyl_rec2.c Normal file
View File

@ -0,0 +1,392 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "../config.h"
#include "f2c.h"
#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
extern void cdotu_(complex *, int *, complex *, int *, complex *, int *);
complex result;
cdotu_(&result, n, x, incx, y, incy);
return result;
}
#define cdotu_ cdotu_fun
complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
extern void cdotc_(complex *, int *, complex *, int *, complex *, int *);
complex result;
cdotc_(&result, n, x, incx, y, incy);
return result;
}
#define cdotc_ cdotc_fun
#endif
#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
complex cladiv_fun(complex *a, complex *b) {
extern void cladiv_(complex *, complex *, complex *);
complex result;
cladiv_(&result, a, b);
return result;
}
#define cladiv_ cladiv_fun
#endif
/* Table of constant values */
static int c__1 = 1;
/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
*
* This routine is an exact copy of LAPACK's ctrsyl.
* It serves as an unblocked kernel in the recursive algorithms.
* */
/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int
*isgn, int *m, int *n, complex *a, int *lda, complex *b,
int *ldb, complex *c__, int *ldc, float *scale, int *info,
ftnlen trana_len, ftnlen tranb_len)
{
/* System generated locals */
int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
float r_imag(complex *);
void r_cnjg(complex *, complex *);
/* Local variables */
static int j, k, l;
static complex a11;
static float db;
static complex x11;
static float da11;
static complex vec;
static float dum[1], eps, sgn, smin;
static complex suml, sumr;
/* Complex */ complex cdotc_(int *, complex *, int
*, complex *, int *);
extern int lsame_(char *, char *, ftnlen, ftnlen);
/* Complex */ complex cdotu_(int *, complex *, int
*, complex *, int *);
extern /* Subroutine */ int slabad_(float *, float *);
extern float clange_(char *, int *, int *, complex *,
int *, float *, ftnlen);
/* Complex */ complex cladiv_(complex *, complex *);
static float scaloc;
extern float slamch_(char *, ftnlen);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*), xerbla_(char *, int *, ftnlen);
static float bignum;
static int notrna, notrnb;
static float smlnum;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
b_dim1 = *ldb;
b_offset = 1 + b_dim1;
b -= b_offset;
c_dim1 = *ldc;
c_offset = 1 + c_dim1;
c__ -= c_offset;
/* Function Body */
notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
*info = 0;
if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
*info = -1;
} else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
*info = -2;
} else if (*isgn != 1 && *isgn != -1) {
*info = -3;
} else if (*m < 0) {
*info = -4;
} else if (*n < 0) {
*info = -5;
} else if (*lda < max(1,*m)) {
*info = -7;
} else if (*ldb < max(1,*n)) {
*info = -9;
} else if (*ldc < max(1,*m)) {
*info = -11;
}
if (*info != 0) {
i__1 = -(*info);
xerbla_("CTRSY2", &i__1, (ftnlen)6);
return;
}
*scale = 1.f;
if (*m == 0 || *n == 0) {
return;
}
eps = slamch_("P", (ftnlen)1);
smlnum = slamch_("S", (ftnlen)1);
bignum = 1.f / smlnum;
slabad_(&smlnum, &bignum);
smlnum = smlnum * (float) (*m * *n) / eps;
bignum = 1.f / smlnum;
/* Computing MAX */
r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, (
ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n,
&b[b_offset], ldb, dum, (ftnlen)1);
smin = dmax(r__1,r__2);
sgn = (float) (*isgn);
if (notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
for (k = *m; k >= 1; --k) {
i__2 = *m - k;
/* Computing MIN */
i__3 = k + 1;
/* Computing MIN */
i__4 = k + 1;
q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
min(i__4,*m) + l * c_dim1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__2 = l - 1;
q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = q__1.r, sumr.i = q__1.i;
i__2 = k + l * c_dim1;
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i;
q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L10: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L20: */
}
/* L30: */
}
} else if (! notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
i__2 = *m;
for (k = 1; k <= i__2; ++k) {
i__3 = k - 1;
q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__3 = l - 1;
q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = q__1.r, sumr.i = q__1.i;
i__3 = k + l * c_dim1;
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
r_cnjg(&q__2, &a[k + k * a_dim1]);
i__3 = l + l * b_dim1;
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L40: */
}
*scale *= scaloc;
}
i__3 = k + l * c_dim1;
c__[i__3].r = x11.r, c__[i__3].i = x11.i;
/* L50: */
}
/* L60: */
}
} else if (! notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
i__1 = *m;
for (k = 1; k <= i__1; ++k) {
i__2 = k - 1;
q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__2 = *n - l;
/* Computing MIN */
i__3 = l + 1;
/* Computing MIN */
i__4 = l + 1;
q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
l + min(i__4,*n) * b_dim1], ldb);
sumr.r = q__1.r, sumr.i = q__1.i;
i__2 = k + l * c_dim1;
r_cnjg(&q__4, &sumr);
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i;
r_cnjg(&q__1, &q__2);
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L70: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L80: */
}
/* L90: */
}
} else if (notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
for (k = *m; k >= 1; --k) {
i__1 = *m - k;
/* Computing MIN */
i__2 = k + 1;
/* Computing MIN */
i__3 = k + 1;
q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
min(i__3,*m) + l * c_dim1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__1 = *n - l;
/* Computing MIN */
i__2 = l + 1;
/* Computing MIN */
i__3 = l + 1;
q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
l + min(i__3,*n) * b_dim1], ldb);
sumr.r = q__1.r, sumr.i = q__1.i;
i__1 = k + l * c_dim1;
r_cnjg(&q__4, &sumr);
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__1 = k + k * a_dim1;
r_cnjg(&q__3, &b[l + l * b_dim1]);
q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i;
q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L100: */
}
*scale *= scaloc;
}
i__1 = k + l * c_dim1;
c__[i__1].r = x11.r, c__[i__1].i = x11.i;
/* L110: */
}
/* L120: */
}
}
return;
}
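
The cdotu_/cdotc_ wrappers at the top of this file bridge two calling conventions for complex-valued BLAS functions: some ABIs return the complex result by value, others write it through a hidden first argument. For reference, this sketch shows what the unconjugated dot product cdotu computes, using a hand-rolled complex type rather than the library's:

#include <stdio.h>

typedef struct { float r, i; } complex_f;   /* stand-in for f2c's complex */

static complex_f cdotu_ref(int n, const complex_f *x, int incx,
                           const complex_f *y, int incy) {
    complex_f s = { 0.f, 0.f };
    int k;
    for (k = 0; k < n; k++) {
        const complex_f a = x[k * incx], b = y[k * incy];
        s.r += a.r * b.r - a.i * b.i;        /* Re(a*b), no conjugation */
        s.i += a.r * b.i + a.i * b.r;        /* Im(a*b) */
    }
    return s;
}

int main(void) {
    complex_f x[2] = { { 1.f, 1.f }, { 2.f, 0.f } };
    complex_f y[2] = { { 3.f, 0.f }, { 0.f, 1.f } };
    const complex_f d = cdotu_ref(2, x, 1, y, 1);
    printf("(%g, %g)\n", d.r, d.i);          /* (1+i)*3 + 2*i = (3, 5) */
    return 0;
}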

107
relapack/src/ctrtri.c Normal file
View File

@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_ctrtri_rec(const char *, const char *, const int *,
float *, const int *, int *);
/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's ctrtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html
* */
void RELAPACK_ctrtri(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
*info = i + 1;
return;
}
}
// Recursive kernel
RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** ctrtri's recursive compute kernel */
static void RELAPACK_ctrtri_rec(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_CTRTRI, 1)) {
// Unblocked
LAPACK(ctrti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
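
All the complex kernels in this diff address matrices the same way: a complex m-by-n matrix is stored column-major as interleaved floats, so element (i,j) lives at A[2*(i + ldA*j)] (real part) and A[2*(i + ldA*j) + 1] (imaginary part), which is why block pointers above look like A + 2 * *ldA * n1 + 2 * n1. A minimal demonstration of the convention:

#include <stdio.h>

int main(void) {
    const int ldA = 2;                   /* leading dimension (>= number of rows) */
    float A[2 * 2 * 2];                  /* 2-by-2 complex matrix, interleaved */
    int i, j;
    for (j = 0; j < 2; j++)
        for (i = 0; i < 2; i++) {
            A[2 * (i + ldA * j)]     = (float)(10 * i + j);  /* real part */
            A[2 * (i + ldA * j) + 1] = 0.5f;                 /* imaginary part */
        }
    printf("A(1,0) = %g + %gi\n",        /* prints: A(1,0) = 10 + 0.5i */
           A[2 * (1 + ldA * 0)], A[2 * (1 + ldA * 0) + 1]);
    return 0;
}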

227
relapack/src/dgbtrf.c Normal file
View File

@ -0,0 +1,227 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *,
const int *, double *, const int *, int *, double *, const int *, double *,
const int *, int *);
/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's dgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html
* */
void RELAPACK_dgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DGBTRF", &minfo);
return;
}
// Constant
const double ZERO[] = { 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + kv;
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
double *const A_j = A + *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[i] = 0.;
}
// Allocate work space
const int n1 = DREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
double *Workl = malloc(mWorkl * nWorkl * sizeof(double));
double *Worku = malloc(mWorku * nWorku * sizeof(double));
LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** dgbtrf's recursive compute kernel */
static void RELAPACK_dgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_DGBTRF, 1)) {
// Unblocked
LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + kv;
// Splitting
const int n1 = MIN(DREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
double *const Ab_L = Ab;
double *const Ab_BR = Ab + *ldAb * n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + m1;
double *const A_BR = A + *ldA * n1 + m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
double *const A_Rl = A_R;
double *const A_Rr = A_R + *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + m21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + *ldA * n21;
double *const A_BRbl = A_BR + m21;
double *const A_BRbr = A_BR + *ldA * n21 + m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
double *const A_Rrj = A_Rr + *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const double tmp = A_Rrj[i];
A_Rrj[i] = A_Rrj[ip];
A_Rrj[ip] = tmp;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}
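
The "unskew" step above is an index identity on LAPACK band storage: entry (i,j) of the band matrix sits at Ab[kv + i - j + ldAb*j] (0-based, kv = kl + ku), so with A = Ab + kv and ldA = ldAb - 1 the very same entry is A[i + ldA*j], and the band can be addressed like a dense matrix. A small check of the identity for one in-band entry:

#include <stdio.h>

int main(void) {
    const int kl = 1, ku = 1;            /* band widths */
    const int kv = kl + ku;              /* result upper band width, as above */
    const int ldAb = 2 * kl + ku + 1;    /* band storage leading dimension */
    const int i = 2, j = 1;              /* an in-band entry: |i - j| <= kl */
    const int banded   = (kv + i - j) + ldAb * j;   /* Ab index of A(i,j) */
    const int unskewed = kv + i + (ldAb - 1) * j;   /* (Ab + kv)[i + (ldAb-1)*j] */
    printf("banded index %d == unskewed index %d\n", banded, unskewed);  /* 7 == 7 */
    return 0;
}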

165
relapack/src/dgemmt.c Normal file
View File

@ -0,0 +1,165 @@
#include "relapack.h"
static void RELAPACK_dgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
/** DGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_dgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !notransA)
info = 2;
else if (!tranB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("DGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : 'T';
const char cleantransB = notransB ? 'N' : 'T';
// Recursive kernel
RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** dgemmt's recursive compute kernel */
static void RELAPACK_dgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_DGEMMT, 1)) {
// Unblocked
RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const double *const A_T = A;
const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const double *const B_L = B;
const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
double *const C_TL = C;
double *const C_TR = C + *ldC * n1;
double *const C_BL = C + n1;
double *const C_BR = C + *ldC * n1 + n1;
// recursion(C_TL)
RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** dgemmt's unblocked compute kernel */
static void RELAPACK_dgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const double *const A_0 = A;
const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
double *const C_0i = C + *ldC * i;
double *const C_ii = C + *ldC * i + i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}
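
For reference, this is a naive (not optimized) version of the operation dgemmt performs with uplo = 'L' and transA = transB = 'N': only C(i,j) with i >= j is updated, and the strict upper triangle is left untouched:

#include <stdio.h>

static void gemmt_lower_ref(int n, int k, double alpha, const double *A, int ldA,
                            const double *B, int ldB, double beta, double *C, int ldC) {
    int i, j, l;
    for (j = 0; j < n; j++)
        for (i = j; i < n; i++) {            /* lower triangle only */
            double s = 0.;
            for (l = 0; l < k; l++)
                s += A[i + ldA * l] * B[l + ldB * j];
            C[i + ldC * j] = alpha * s + beta * C[i + ldC * j];
        }
}

int main(void) {
    double A[4] = { 1., 2., 3., 4. };        /* [1 3; 2 4], column-major */
    double B[4] = { 1., 0., 0., 1. };        /* identity */
    double C[4] = { 0., 0., 99., 0. };       /* C(0,1) = 99 must survive */
    gemmt_lower_ref(2, 2, 1., A, 2, B, 2, 0., C, 2);
    printf("C = [%g %g; %g %g]\n", C[0], C[2], C[1], C[3]);  /* [1 99; 2 4] */
    return 0;
}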

117
relapack/src/dgetrf.c Normal file
View File

@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
const int *, int *, int *);
/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's dgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
* */
void RELAPACK_dgetrf(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const double ONE[] = { 1. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const double *const A_L = A;
double *const A_R = A + *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** dgetrf's recursive compute kernel */
static void RELAPACK_dgetrf_rec(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
// Unblocked
LAPACK(dgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}
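
dgetf2, the base case of this recursion, is the classical unblocked LU with partial pivoting. The following is a minimal self-contained reference version (not the library routine) for column-major storage with LAPACK-style 1-based pivot indices:

#include <math.h>
#include <stdio.h>

static int lu_unblocked(int n, double *A, int ldA, int *ipiv) {
    int i, j, k, p;
    for (j = 0; j < n; j++) {
        p = j;                               /* find the largest pivot in column j */
        for (i = j + 1; i < n; i++)
            if (fabs(A[i + ldA * j]) > fabs(A[p + ldA * j])) p = i;
        ipiv[j] = p + 1;                     /* 1-based, as in LAPACK */
        if (A[p + ldA * j] == 0.0) return j + 1;   /* singular: info > 0 */
        for (k = 0; k < n; k++) {            /* swap rows j and p */
            const double t = A[j + ldA * k];
            A[j + ldA * k] = A[p + ldA * k];
            A[p + ldA * k] = t;
        }
        for (i = j + 1; i < n; i++) {        /* eliminate below the pivot */
            A[i + ldA * j] /= A[j + ldA * j];
            for (k = j + 1; k < n; k++)
                A[i + ldA * k] -= A[i + ldA * j] * A[j + ldA * k];
        }
    }
    return 0;
}

int main(void) {
    double A[4] = { 1., 3., 2., 4. };        /* [1 2; 3 4], column-major */
    int ipiv[2];
    lu_unblocked(2, A, 2, ipiv);
    printf("ipiv = {%d, %d}, L21 = %g, U = [%g %g; 0 %g]\n",
           ipiv[0], ipiv[1], A[1], A[0], A[2], A[3]);
    return 0;
}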

87
relapack/src/dlauum.c Normal file
View File

@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_dlauum_rec(const char *, const int *, double *,
const int *, int *);
/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's dlauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html
* */
void RELAPACK_dlauum(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info);
}
/** dlauum's recursive compute kernel */
static void RELAPACK_dlauum_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_DLAUUM, 1)) {
// Unblocked
LAPACK(dlauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info);
}
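
A worked instance of what dlauum computes with uplo = 'L': the lower triangle of A is overwritten with the lower triangle of L' * L. The naive triple loop below (a reference sketch, not the blocked algorithm) produces the same entries:

#include <stdio.h>

int main(void) {
    const double L[4] = { 2., 1., 0., 3. };  /* L = [2 0; 1 3], column-major */
    double R[4] = { 0., 0., 0., 0. };
    int i, j, k;
    for (i = 0; i < 2; i++)
        for (j = 0; j <= i; j++)             /* lower triangle only */
            for (k = 0; k < 2; k++)
                R[i + 2 * j] += L[k + 2 * i] * L[k + 2 * j];  /* (L'*L)(i,j) */
    printf("lower(L'*L) = [%g .; %g %g]\n", R[0], R[1], R[3]);  /* [5 .; 3 9] */
    return 0;
}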

157
relapack/src/dpbtrf.c Normal file
View File

@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *,
double *, const int *, double *, const int *, int *);
/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's dpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html
* */
void RELAPACK_dpbtrf(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const double ZERO[] = { 0. };
// Allocate work space
const int n1 = DREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
double *Work = malloc(mWork * nWork * sizeof(double));
LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** dpbtrf's recursive compute kernel */
static void RELAPACK_dpbtrf_rec(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
double *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
// Unblocked
LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(DREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
double *const Ab_BR = Ab + *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, n1);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + n21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + *ldA * n21;
double *const A_BRbl = A_BR + n21;
double *const A_BRbr = A_BR + *ldA * n21 + n21;
if (*uplo == 'L') {
// A_BLt = A_BLt / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/dpotrf.c Normal file
View File

@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_dpotrf_rec(const char *, const int *, double *,
const int *, int *);
/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's dpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
* */
void RELAPACK_dpotrf(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** dpotrf's recursive compute kernel */
static void RELAPACK_dpotrf_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_DPOTRF, 1)) {
// Unblocked
LAPACK(dpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
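
For a 2-by-2 matrix with n1 = n2 = 1, the lower-triangular recursion above reduces to the textbook Cholesky formulas: l11 = sqrt(a11) from recursion(A_TL), l21 = a21/l11 from the dtrsm step, and l22 = sqrt(a22 - l21^2) from the dsyrk downdate plus recursion(A_BR). A worked instance:

#include <math.h>
#include <stdio.h>

int main(void) {
    const double a11 = 4., a21 = 2., a22 = 5.;   /* A = [4 2; 2 5], SPD */
    const double l11 = sqrt(a11);                /* recursion(A_TL) */
    const double l21 = a21 / l11;                /* A_BL = A_BL / A_TL' (dtrsm) */
    const double l22 = sqrt(a22 - l21 * l21);    /* dsyrk downdate + recursion(A_BR) */
    printf("L = [%g 0; %g %g]\n", l11, l21, l22);  /* [2 0; 1 2] */
    return 0;
}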

Some files were not shown because too many files have changed in this diff.