Merge branch 'develop' into cgroups

Martin Kroeker 2017-07-15 10:40:42 +02:00 committed by GitHub
commit 7294fb1d9d
169 changed files with 22837 additions and 840 deletions

View File

@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
)
ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h)
ADD_CUSTOM_TARGET(genconfig
ALL
DEPENDS openblas_config.h
)
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME}
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(genf77blas
ALL
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
)
@ -255,11 +260,11 @@ if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(gencblas
ALL
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
)
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

View File

@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
endif
RELA =
ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install
.NOTPARALLEL : all libs prof lapack-test install blas-test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
all :: libs netlib tests shared
all :: libs netlib $(RELA) tests shared
@echo
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
@echo
@ -215,6 +220,14 @@ ifndef NO_LAPACKE
endif
endif
ifeq ($(NO_LAPACK), 1)
re_lapack :
else
re_lapack :
@$(MAKE) -C relapack
endif
prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
@ -326,11 +339,7 @@ endif
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@$(MAKE) -C relapack clean
@rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
@echo Done.
# Makefile debugging trick:
# call print-VARIABLE to see the runtime value of any variable
print-%:
@echo '$*=$($*)'
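The print-% rule above is generic: for example, "make print-LAPACK_NOOPT" would echo the value of LAPACK_NOOPT after the filter-out near the top of this file, without building anything else, since the rule has no prerequisites.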

View File

@ -1,5 +1,4 @@
#ifeq logical or
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
ifeq ($(ARM_SOFTFP_ABI), 1)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
else
CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
endif
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -march=armv5
FCOMMON_OPT += -marm -march=armv5
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
endif

View File

@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
endif
ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
endif

View File

@ -83,6 +83,9 @@ VERSION = 0.2.20.dev
# Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
NO_AFFINITY = 1
#NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1

View File

@ -242,6 +242,10 @@ EXTRALIB += -lm
NO_EXPRECISION = 1
endif
ifeq ($(OSNAME), Android)
EXTRALIB += -lm
endif
ifeq ($(OSNAME), AIX)
EXTRALIB += -lm
endif
@ -486,12 +490,18 @@ BINARY_DEFINED = 1
CCOMMON_OPT += -marm
FCOMMON_OPT += -marm
# If softfp abi is mentioned on the command line, force it.
ifeq ($(ARM_SOFTFP_ABI), 1)
CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
CCOMMON_OPT += -mfloat-abi=softfp
FCOMMON_OPT += -mfloat-abi=softfp
endif
ifeq ($(OSNAME), Android)
ifeq ($(ARM_SOFTFP_ABI), 1)
EXTRALIB += -lm
else
CCOMMON_OPT += -mfloat-abi=hard
FCOMMON_OPT += -mfloat-abi=hard
EXTRALIB += -Wl,-lm_hard
endif
endif
endif
@ -1119,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
ifneq ($(NO_LAPACKE), 1)
LIB_COMPONENTS += LAPACKE
endif
ifeq ($(BUILD_RELAPACK), 1)
LIB_COMPONENTS += ReLAPACK
endif
endif
ifeq ($(ONLY_CBLAS), 1)

View File

@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF}
"#define __${BINARY}BIT__\t1\n"
"#define FUNDERSCORE\t${FU}\n")
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
file(APPEND ${TARGET_CONF}
"#define OS_WINNT\t1\n")
endif ()

View File

@ -77,7 +77,7 @@ if (CYGWIN)
set(NO_EXPRECISION 1)
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
if (SMP)
set(EXTRALIB "${EXTRALIB} -lpthread")
endif ()

View File

@ -72,20 +72,26 @@ if (MSVC)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
endif()
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
# disable WindowsStore strict CRT checks
set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
endif ()
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif ()
endif ()
message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
if (NOT ${GETARCH2_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
if (NOT ${GETARCH2_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
endif ()
endif ()
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
)
if (NOT ${GEN_CONFIG_H_RESULT})
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
endif ()
if (NOT ${GEN_CONFIG_H_RESULT})
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
endif ()
endif ()

View File

@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#endif
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n))
#endif
#endif
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
#ifdef _POSIX_MONOTONIC_CLOCK
@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
*address = 0;
}
#ifdef OS_WINDOWSSTORE
static __inline int readenv_atoi(char *env) {
return 0;
}
#else
#ifdef OS_WINDOWS
static __inline int readenv_atoi(char *env) {
env_var_t p;
@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
return(0);
}
#endif
#endif
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
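The new OS_WINDOWSSTORE branches above compile environment lookups down to constant stubs, since UWP apps cannot call GetEnvironmentVariable. A self-contained sketch of the same pattern (a hypothetical demo, not the common.h code; OpenBLAS keys this off its own OS_WINDOWSSTORE define):

/* demo.c -- stub out env lookups where no environment API exists */
#include <stdio.h>
#include <stdlib.h>

#ifdef OS_WINDOWSSTORE
static int readenv_atoi(const char *name) { (void)name; return 0; } /* tunables fall back to built-in defaults */
#else
static int readenv_atoi(const char *name) {
  const char *p = getenv(name);
  return p ? atoi(p) : 0;
}
#endif

int main(void) {
  /* e.g. OPENBLAS_NUM_THREADS=4 ./demo */
  printf("threads requested: %d\n", readenv_atoi("OPENBLAS_NUM_THREADS"));
  return 0;
}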

View File

@ -111,11 +111,6 @@ REALNAME:
#define PROFCODE
#ifdef __ARM_PCS
//-mfloat-abi=softfp
#define SOFT_FLOAT_ABI
#endif
#endif
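The deleted block derived a private SOFT_FLOAT_ABI macro from __ARM_PCS; the rewritten kernels below instead test the compiler's ACLE predefines directly. A small probe showing the distinction they rely on (assumes GCC or Clang targeting ARM):

#include <stdio.h>

int main(void) {
#if defined(__ARM_PCS_VFP)
  puts("hard-float PCS: FP arguments and results use VFP registers");
#elif defined(__ARM_PCS)
  puts("base PCS (soft/softfp): FP arguments and results use core registers");
#else
  puts("not an ARM AAPCS target");
#endif
  return 0;
}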

View File

@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;
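This is an off-by-one fix that recurs in the next several files: splitting a range among k threads produces k+1 boundary values, so the range arrays need MAX_CPU_NUMBER + 1 slots. A toy illustration of the fence-post count (illustrative code, not the OpenBLAS partitioner):

#include <stdio.h>
#define MAX_CPU_NUMBER 4

int main(void) {
  long range[MAX_CPU_NUMBER + 1];            /* k partitions need k+1 fence posts */
  long n = 1000;
  for (int i = 0; i <= MAX_CPU_NUMBER; i++)  /* writes MAX_CPU_NUMBER + 1 entries */
    range[i] = n * i / MAX_CPU_NUMBER;
  for (int i = 0; i < MAX_CPU_NUMBER; i++)
    printf("thread %d gets [%ld, %ld)\n", i, range[i], range[i + 1]);
  return 0;
}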

View File

@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#endif
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
blas_queue_t queue[MAX_CPU_NUMBER + 1];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];

View File

@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)i;
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i);
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
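Both hunks replace mask-based rounding with divide-and-multiply. The old form ((x + mask) & ~mask) only yields a multiple of (mask + 1) when that alignment is a power of two; the new form is correct for any alignment. A quick standalone check (made-up numbers):

#include <stdio.h>

int main(void) {
  long mask = 2, x = 8;                                   /* alignment 3: not a power of two */
  long old_way = (x + mask) & ~mask;                      /* 10 & ~2 = 8, not a multiple of 3 */
  long new_way = ((x + mask) / (mask + 1)) * (mask + 1);  /* 9, correctly rounded */
  printf("old=%ld new=%ld\n", old_way, new_way);
  return 0;
}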

View File

@ -12,6 +12,8 @@ if (SMP)
set(BLAS_SERVER blas_server_omp.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(BLAS_SERVER blas_server_win32.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
set(BLAS_SERVER blas_server_win32.c)
endif ()
if (NOT DEFINED BLAS_SERVER)

View File

@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
TerminateThread(blas_threads[i],0);
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
}
blas_server_avail = 0;
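As the new comment notes, TerminateThread belongs to the desktop/system API partitions, so UWP builds must compile it out. A reduced sketch of the same guard using the SDK's partition macro (simplified; the real shutdown walks the whole thread pool):

#include <windows.h>   /* pulls in winapifamily.h on Windows 8+ SDKs */
#include <stdio.h>

static DWORD WINAPI worker(LPVOID arg) { (void)arg; return 0; }

static void reap_thread(HANDLE h) {
  WaitForSingleObject(h, 5);     /* give the worker a short grace period */
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
  TerminateThread(h, 0);         /* forced kill: desktop partition only */
#endif
  CloseHandle(h);
}

int main(void) {
  HANDLE h = CreateThread(NULL, 0, worker, NULL, 0, NULL);
  if (h) reap_thread(h);
  puts("shut down");
  return 0;
}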

View File

@ -825,10 +825,11 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
if (common -> magic != SH_MAGIC)
cpu_set_t *cpusetp;
int nums;
int ret;
#ifdef DEBUG
fprintf(stderr, "Shared Memory Initialization.\n");
#endif
@ -883,7 +884,7 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = 0;
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;

View File

@ -1,7 +1,5 @@
include $(KERNELDIR)/KERNEL.ARMV5
###############################################################################
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal.c
DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
endif
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
endif
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S

View File

@ -1,91 +1,12 @@
#################################################################################
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S
SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S
SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S
SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S
ISMINKERNEL = iamax_vfp.S
IDMINKERNEL = iamax_vfp.S
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S
SASUMKERNEL = asum_vfp.S
DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S
SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
CDOTKERNEL = cdot_vfp.S
ZDOTKERNEL = zdot_vfp.S
include $(KERNELDIR)/KERNEL.ARMV6
SNRM2KERNEL = nrm2_vfpv3.S
DNRM2KERNEL = nrm2_vfpv3.S
CNRM2KERNEL = nrm2_vfpv3.S
ZNRM2KERNEL = nrm2_vfpv3.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal.c
DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SGEMVNKERNEL = gemv_n_vfpv3.S
DGEMVNKERNEL = gemv_n_vfpv3.S
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S

View File

@ -475,6 +475,14 @@ asum_kernel_L999:
vadd.f32 s0 , s0, s1 // set return value
#endif
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif
bx lr
EPILOGUE
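This return-value fixup recurs across the asum/nrm2/dot kernels below: without __ARM_PCS_VFP the caller expects the result in core registers, so the epilogue moves s0 into r0 (float) or d0 into r0:r1 (double) before bx lr; under the hard-float ABI the value is already where it belongs. The same contract seen from C (illustrative functions; compile once with -mfloat-abi=softfp and once with hard to compare the generated epilogues):

#include <math.h>
#include <stdio.h>

/* Under softfp these return in r0 / r0:r1; under the hard-float ABI the
   identical C returns in s0 / d0. Hand-written kernels must honor both. */
static float  sasum_like(const float *x, long n)  { float  s = 0; while (n--) s += fabsf(*x++); return s; }
static double dasum_like(const double *x, long n) { double s = 0; while (n--) s += fabs(*x++);  return s; }

int main(void) {
  float  xf[3] = {1.0f, -2.0f, 3.0f};
  double xd[3] = {1.0, -2.0, 3.0};
  printf("%g %g\n", sasum_like(xf, 3), dasum_like(xd, 3));
  return 0;
}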

View File

@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI
//hard
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
#else
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_X [fp, #0 ]
#define OLD_INC_X [fp, #4 ]
#define OLD_Y [fp, #8 ]
#define OLD_INC_Y [fp, #12 ]
#else
#define OLD_ALPHA [fp, #0]
#define OLD_X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define OLD_Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#endif
#else //COMPLEX
#if !defined(DOUBLE)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define OLD_Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#else
#define OLD_ALPHAR [fp, #0]
#define OLD_ALPHAI [fp, #8]
#define OLD_X [fp, #16 ]
#define OLD_INC_X [fp, #20 ]
#define OLD_Y [fp, #24 ]
#define OLD_INC_Y [fp, #28 ]
#endif
#endif //!defined(COMPLEX)
#else //__ARM_PCS_VFP
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
#endif //!defined(__ARM_PCS_VFP)
#define N r0
#define Y r1
#define INC_X r2
@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
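The mnemonic swaps in this file (and in the gemm/gemv/rot kernels below) are a syntax migration, not a behavioral change: fnmacs/fnmacd are pre-UAL VFP spellings of multiply-subtract that newer GNU assemblers reject, and vmls.f32/vmls.f64 (likewise vnmul for fnmuls) are the unified-syntax equivalents. In scalar C terms (a model of the instruction semantics, nothing more):

#include <stdio.h>

static float vmls_f32(float acc, float a, float b)  { return acc - a * b; }  /* was fnmacs */
static float vmla_f32(float acc, float a, float b)  { return acc + a * b; }  /* fmacs, unchanged here */
static float vnmul_f32(float a, float b)            { return -(a * b);    }  /* was fnmuls */

int main(void) {
  printf("vmls:  %g\n", vmls_f32(10.0f, 2.0f, 3.0f));   /* 4 */
  printf("vmla:  %g\n", vmla_f32(10.0f, 2.0f, 3.0f));   /* 16 */
  printf("vnmul: %g\n", vnmul_f32(2.0f, 3.0f));         /* -6 */
  return 0;
}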
@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8
sub sp, sp, #STACKSIZE // reserve stack
#ifdef ARM_SOFTFP_ABI
#ifndef DOUBLE
vmov s0, r3 //move alpha to s0
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
ldr X, OLD_X
#else
vldr d0, OLD_ALPHA
ldr X, OLD_X
#endif
#else //COMPLEX
#if !defined(DOUBLE)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr X, OLD_X
#else
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr X, OLD_X
#endif
#endif
#endif
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y
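The macro blocks above encode where each argument lands for every ABI/type combination: AAPCS fills r0-r3 in order and spills the rest to the caller's stack, which the kernel reads back through [fp, #offset]. A reduced C signature showing the softfp layout (hypothetical; the real BLAS kernels carry extra dummy arguments, so their actual offsets differ):

/* softfp: n -> r0, alpha -> r1 (a core register!), x -> r2, inc_x -> r3,
   y and inc_y -> stack slots. Hard-float moves alpha to s0 and shifts the
   remaining arguments down, which is why both layouts are spelled out. */
static void saxpy_like(long n, float alpha,
                       const float *x, long inc_x,
                       float *y, long inc_y) {
  for (long i = 0; i < n; i++)
    y[i * inc_y] += alpha * x[i * inc_x];
}

int main(void) {
  float x[3] = {1, 2, 3}, y[3] = {0, 0, 0};
  saxpy_like(3, 2.0f, x, 1, y, 1);
  return y[2] == 6.0f ? 0 : 1;
}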

View File

@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers
*******************************************************/
#define OLD_INC_Y [fp, #4 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_RETURN_ADDR r0
#define OLD_N r1
#define OLD_X r2
#define OLD_INC_X r3
#define OLD_Y [fp, #0 ]
#define OLD_INC_Y [fp, #4 ]
#define RETURN_ADDR r8
#else
#define OLD_Y r3
#define OLD_INC_Y [fp, #0 ]
#endif
#define I r5
#define Y r6
@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
push {r4 - r9, fp}
add fp, sp, #24
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmov s2, s0
vmov s3, s0
#if !defined(__ARM_PCS_VFP)
mov RETURN_ADDR, OLD_RETURN_ADDR
mov N, OLD_N
mov X, OLD_X
mov INC_X, OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#else
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#endif
cmp N, #0
ble cdot_kernel_L999
@ -265,7 +283,6 @@ cdot_kernel_S10:
cdot_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
@ -276,8 +293,11 @@ cdot_kernel_L999:
vadd.f32 s0 , s0, s2
vsub.f32 s1 , s1, s3
#endif
#if !defined(__ARM_PCS_VFP)
vstm RETURN_ADDR, {s0 - s1}
#endif
sub sp, fp, #24
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr
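The cdot changes follow from the return convention: the two-float result is a composite larger than 4 bytes, so under the base PCS the caller passes a result address in r0 (saved above as RETURN_ADDR) and every visible argument shifts one slot — hence the re-read of N, X and INC_X from shifted positions and the final vstm through the hidden pointer. Modeled in C (stand-in struct type, and a simplified body that only sums x; the compiler inserts the hidden pointer for you):

#include <stdio.h>

typedef struct { float re, im; } cfloat;   /* stand-in for the complex float result */

/* Returned via a hidden pointer under AAPCS, since the struct exceeds 4 bytes. */
static cfloat csum_like(long n, const float *x, long inc_x) {
  cfloat r = {0.0f, 0.0f};
  for (long i = 0; i < n; i++) {
    r.re += x[2 * i * inc_x];
    r.im += x[2 * i * inc_x + 1];
  }
  return r;                                /* written through the hidden pointer */
}

int main(void) {
  float x[4] = {1, 2, 3, 4};
  cfloat r = csum_like(2, x, 1);
  printf("(%g, %g)\n", r.re, r.im);
  return 0;
}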

View File

@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CN) || defined(CT)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmacs
#define FMAC_R2 fnmacs
#define FMAC_R1 vmls.f32
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fnmacs
#define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT)
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmacs
#define FMAC_R1 vmls.f32
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 vmls.f32
#endif
@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_A_SOFTFP [fp, #4 ]
#define OLD_LDA [fp, #8 ]
#define X [fp, #12 ]
#define OLD_INC_X [fp, #16 ]
#define Y [fp, #20 ]
#define OLD_INC_Y [fp, #24 ]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble cgemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M
vstr s0 , ALPHA_R

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_A_SOFTFP [fp, #4 ]
#define OLD_LDA [fp, #8 ]
#define X [fp, #12 ]
#define OLD_INC_X [fp, #16 ]
#define Y [fp, #20 ]
#define OLD_INC_Y [fp, #24 ]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble cgemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_N, N

View File

@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#define OFFSET [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CN) || defined(CT)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#define OFFSET [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmuls
#define FMAC_R2 fnmacs
#define FMAC_R1 vnmul.f32
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls
#define FMAC_I2 fnmacs
#define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT)
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuls
#define FMAC_R2 fmacs
#define FMAC_I1 fnmuls
#define FMAC_I1 vnmul.f32
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs
#define FMAC_R1 fmuls
#define FMAC_R2 fnmacs
#define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls
#define FMAC_I2 fmacs
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmuls
#define FMAC_R1 vnmul.f32
#define FMAC_R2 fmacs
#define FMAC_I1 fnmuls
#define FMAC_I2 fnmacs
#define FMAC_I1 vnmul.f32
#define FMAC_I2 vmls.f32
#endif
@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -246,6 +246,9 @@ ddot_kernel_L999:
vldm r3, { d8 - d15} // restore floating point registers
vadd.f64 d0 , d0, d1 // set return value
#if !defined(__ARM_PCS_VFP)
vmov r0, r1, d0
#endif
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

View File

@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define OLD_C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#define OFFSET [fp, #28 ]
#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define OLD_C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#define OFFSET [fp, #28 ]
#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble gemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M

View File

@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI
//hard
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#else
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
#define AO1 r0
@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble gemvn_kernel_L999
#ifndef DOUBLE
#ifdef ARM_SOFTFP_ABI
vmov s0, OLD_ALPHA
ldr OLD_A, OLD_A_SOFTFP
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

View File

@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI
//hard abi
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#else
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
#define M r0
@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble gemvt_kernel_L999
#ifndef DOUBLE
#ifdef ARM_SOFTFP_ABI
vmov s0, OLD_ALPHA
ldr OLD_A, OLD_A_SOFTFP
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A

View File

@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble gemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_N, N

View File

@ -573,6 +573,13 @@ nrm2_kernel_L999:
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#endif
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif
bx lr

View File

@ -503,8 +503,13 @@ nrm2_kernel_L999:
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#ifdef ARM_SOFTFP_ABI
vmov r0, s0
#endif
#if !defined(__ARM_PCS_VFP)
#if defined(DOUBLE)
vmov r0, r1, d0
#else
vmov r0, s0
#endif
#endif

View File

@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OLD_INC_Y [fp, #0 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_C [fp, #4]
#define OLD_S [fp, #8]
#else
#define OLD_C [fp, #8]
#define OLD_S [fp, #16]
#endif
#endif
#define N r0
#define X r1
@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X, { d2 }
fstmiad Y, { d3 }
@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X, { s2 }
fstmias Y, { s3 }
@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vmls.f64 d3 , d1, d4
vstr d2 , [ X, #0 ]
vstr d3 , [ Y, #0 ]
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vmls.f64 d3 , d1, d5
vstr d2 , [ X, #8 ]
vstr d3 , [ Y, #8 ]
@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vmls.f32 s3 , s1, s4
vstr s2 , [ X, #0 ]
vstr s3 , [ Y, #0 ]
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vmls.f32 s3 , s1, s5
vstr s2 , [ X, #4 ]
vstr s3 , [ Y, #4 ]
@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8
ldr INC_Y , OLD_INC_Y
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vldr s0, OLD_C
vldr s1, OLD_S
#else
vldr d0, OLD_C
vldr d1, OLD_S
#endif
#endif
cmp N, #0
ble rot_kernel_L999

View File

@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X, { d2 - d3 }
@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X, { s2 - s3 }

View File

@ -329,20 +329,19 @@ sdot_kernel_L999:
vldm r3, { s8 - s15} // restore floating point registers
#if defined(DSDOT)
vadd.f64 d0 , d0, d1 // set return value
#ifdef ARM_SOFTFP_ABI
vmov r0, r1, d0
#else
vadd.f32 s0 , s0, s1 // set return value
#endif
#if !defined(__ARM_PCS_VFP)
#if defined(DSDOT)
vmov r0, r1, d0
#else
vadd.f32 s0 , s0, s1 // set return value
#ifdef ARM_SOFTFP_ABI
vmov r0, s0
#endif
#endif
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

View File

@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K

View File

@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OLD_M r0
#define OLD_N r1
#define OLD_K r2
#ifdef ARM_SOFTFP_ABI
#define OLD_ALPHA r3
//#define OLD_A
#else //hard
#define OLD_A r3
#define OLD_ALPHA s0
#endif
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define M [fp, #-256 ]
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#ifndef ARM_SOFTFP_ABI
#define A [fp, #-268 ]
#endif
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, #-240]
@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#ifdef ARM_SOFTFP_ABI
#define A [fp, #4 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#else //hard
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
#define L r2
@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
#ifdef ARM_SOFTFP_ABI
str OLD_ALPHA, ALPHA
#else //hard
str OLD_A, A
vstr OLD_ALPHA, ALPHA
#endif
sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers


@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define OLD_C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#define OFFSET [fp, #20 ]
#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#define OFFSET [fp, #20 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define OLD_X [fp, #0 ]
#define OLD_INC_X [fp, #4 ]
#define OLD_Y [fp, #8 ]
#define OLD_INC_Y [fp, #12 ]
#else
#define OLD_X [fp, #8 ]
#define OLD_INC_X [fp, #12]
#define OLD_Y [fp, #16]
#define OLD_INC_Y [fp, #20]
#endif
#else //COMPLEX
#if !defined(DOUBLE)
#define OLD_X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define OLD_Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#else
#define OLD_X [fp, #16]
#define OLD_INC_X [fp, #20]
#define OLD_Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#endif // !defined(__ARM_PCS_VFP)
#else
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
#endif
#define N r0
@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
push {r4 , fp}
add fp, sp, #8
#if !defined(__ARM_PCS_VFP)
ldr X, OLD_X
#endif
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y


@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers
*******************************************************/
#define OLD_INC_Y [fp, #4 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_RETURN_ADDR r0
#define OLD_N r1
#define OLD_X r2
#define OLD_INC_X r3
#define OLD_Y [fp, #0 ]
#define OLD_INC_Y [fp, #4 ]
#define RETURN_ADDR r8
#else
#define OLD_Y r3
#define OLD_INC_Y [fp, #0 ]
#endif
#define I r5
#define Y r6
@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
push {r4 - r9, fp}
add fp, sp, #24
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vcvt.f64.f32 d2, s0
vcvt.f64.f32 d3, s0
#if !defined(__ARM_PCS_VFP)
mov RETURN_ADDR, OLD_RETURN_ADDR
mov N, OLD_N
mov X, OLD_X
mov INC_X, OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#else
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#endif
cmp N, #0
ble zdot_kernel_L999
@ -280,8 +297,11 @@ zdot_kernel_L999:
vadd.f64 d0 , d0, d2
vsub.f64 d1 , d1, d3
#endif
#if !defined(__ARM_PCS_VFP)
vstm RETURN_ADDR, {d0 - d1}
#endif
sub sp, fp, #24
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr


@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CN) || defined(CT)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
#define I r0
#define J r1
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmacd
#define FMAC_R2 fnmacd
#define FMAC_R1 vmls.f64
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fnmacd
#define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT)
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmacd
#define FMAC_R1 vmls.f64
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 vmls.f64
#endif
@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR [fp, #0 ]
#define OLD_ALPHAI [fp, #8 ]
#define OLD_A_SOFTFP [fp, #16]
#define OLD_LDA [fp, #20]
#define X [fp, #24]
#define OLD_INC_X [fp, #28]
#define Y [fp, #32]
#define OLD_INC_Y [fp, #36]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0
@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble zgemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M
vstr d0 , ALPHA_R


@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR [fp, #0 ]
#define OLD_ALPHAI [fp, #8 ]
#define OLD_A_SOFTFP [fp, #16]
#define OLD_LDA [fp, #20]
#define X [fp, #24]
#define OLD_INC_X [fp, #28]
#define Y [fp, #32]
#define OLD_INC_Y [fp, #36]
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1
@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble zgemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_N, N


@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#define OFFSET [fp, #36 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CN) || defined(CT)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#define OFFSET [fp, #36 ]
#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
#endif
#define I r0
#define J r1
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmuld
#define FMAC_R2 fnmacd
#define FMAC_R1 vnmul.f64
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld
#define FMAC_I2 fnmacd
#define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT)
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuld
#define FMAC_R2 fmacd
#define FMAC_I1 fnmuld
#define FMAC_I1 vnmul.f64
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubd
#define FMAC_R1 fmuld
#define FMAC_R2 fnmacd
#define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld
#define FMAC_I2 fmacd
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmuld
#define FMAC_R1 vnmul.f64
#define FMAC_R2 fmacd
#define FMAC_I1 fnmuld
#define FMAC_I2 fnmacd
#define FMAC_I1 vnmul.f64
#define FMAC_I2 vmls.f64
#endif
@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K


@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %8, %2 \n\t"
"lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %10, %2 \n\t"
"lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %12, %2 \n\t"
"lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x)
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"
"lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"


@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
{
__asm__
(
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t"
"lxvw4x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t"
"lxvw4x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t"
"lxvw4x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"lxvw4x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"lxvw4x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"lxvw4x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"2: \n\t"
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:


@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
"lxvw4x 32, 0, %4 \n\t"
"lxvw4x 33, %5, %4 \n\t"
"lxvw4x 34, %6, %4 \n\t"
"lxvw4x 35, %7, %4 \n\t"
"lxvw4x 36, %8, %4 \n\t"
"lxvw4x 37, %9, %4 \n\t"
"lxvw4x 38, %10, %4 \n\t"
"lxvw4x 39, %11, %4 \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %4 \n\t"
"lxvd2x 35, %7, %4 \n\t"
"lxvd2x 36, %8, %4 \n\t"
"lxvd2x 37, %9, %4 \n\t"
"lxvd2x 38, %10, %4 \n\t"
"lxvd2x 39, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
"lxvw4x 40, 0, %4 \n\t"
"lxvw4x 41, %5, %4 \n\t"
"lxvw4x 42, %6, %4 \n\t"
"lxvw4x 43, %7, %4 \n\t"
"lxvw4x 44, %8, %4 \n\t"
"lxvw4x 45, %9, %4 \n\t"
"lxvw4x 46, %10, %4 \n\t"
"lxvw4x 47, %11, %4 \n\t"
"lxvd2x 40, 0, %4 \n\t"
"lxvd2x 41, %5, %4 \n\t"
"lxvd2x 42, %6, %4 \n\t"
"lxvd2x 43, %7, %4 \n\t"
"lxvd2x 44, %8, %4 \n\t"
"lxvd2x 45, %9, %4 \n\t"
"lxvd2x 46, %10, %4 \n\t"
"lxvd2x 47, %11, %4 \n\t"
"addi %4, %4, -128 \n\t"
"lxvw4x 48, 0, %3 \n\t"
"lxvw4x 49, %5, %3 \n\t"
"lxvw4x 50, %6, %3 \n\t"
"lxvw4x 51, %7, %3 \n\t"
"lxvw4x 0, %8, %3 \n\t"
"lxvw4x 1, %9, %3 \n\t"
"lxvw4x 2, %10, %3 \n\t"
"lxvw4x 3, %11, %3 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"lxvd2x 0, %8, %3 \n\t"
"lxvd2x 1, %9, %3 \n\t"
"lxvd2x 2, %10, %3 \n\t"
"lxvd2x 3, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"lxvw4x 4, 0, %3 \n\t"
"lxvw4x 5, %5, %3 \n\t"
"lxvw4x 6, %6, %3 \n\t"
"lxvw4x 7, %7, %3 \n\t"
"lxvw4x 8, %8, %3 \n\t"
"lxvw4x 9, %9, %3 \n\t"
"lxvw4x 10, %10, %3 \n\t"
"lxvw4x 11, %11, %3 \n\t"
"lxvd2x 4, 0, %3 \n\t"
"lxvd2x 5, %5, %3 \n\t"
"lxvd2x 6, %6, %3 \n\t"
"lxvd2x 7, %7, %3 \n\t"
"lxvd2x 8, %8, %3 \n\t"
"lxvd2x 9, %9, %3 \n\t"
"lxvd2x 10, %10, %3 \n\t"
"lxvd2x 11, %11, %3 \n\t"
"addi %3, %3, -128 \n\t"
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 48, 0, %4 \n\t"
"stxvw4x 49, %5, %4 \n\t"
"stxvw4x 50, %6, %4 \n\t"
"stxvw4x 51, %7, %4 \n\t"
"stxvw4x 0, %8, %4 \n\t"
"stxvw4x 1, %9, %4 \n\t"
"stxvw4x 2, %10, %4 \n\t"
"stxvw4x 3, %11, %4 \n\t"
"stxvd2x 48, 0, %4 \n\t"
"stxvd2x 49, %5, %4 \n\t"
"stxvd2x 50, %6, %4 \n\t"
"stxvd2x 51, %7, %4 \n\t"
"stxvd2x 0, %8, %4 \n\t"
"stxvd2x 1, %9, %4 \n\t"
"stxvd2x 2, %10, %4 \n\t"
"stxvd2x 3, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
"stxvw4x 4, 0, %4 \n\t"
"stxvw4x 5, %5, %4 \n\t"
"stxvw4x 6, %6, %4 \n\t"
"stxvw4x 7, %7, %4 \n\t"
"stxvw4x 8, %8, %4 \n\t"
"stxvw4x 9, %9, %4 \n\t"
"stxvw4x 10, %10, %4 \n\t"
"stxvw4x 11, %11, %4 \n\t"
"stxvd2x 4, 0, %4 \n\t"
"stxvd2x 5, %5, %4 \n\t"
"stxvd2x 6, %6, %4 \n\t"
"stxvd2x 7, %7, %4 \n\t"
"stxvd2x 8, %8, %4 \n\t"
"stxvd2x 9, %9, %4 \n\t"
"stxvd2x 10, %10, %4 \n\t"
"stxvd2x 11, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"


@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %8, %2 \n\t"
"lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %10, %2 \n\t"
"lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %12, %2 \n\t"
"lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"
"lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"


@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
{
__asm__
(
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"2: \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:


@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t"
"lxvw4x 41, %10, %2 \n\t"
"lxvw4x 49, %10, %3 \n\t"
"lxvw4x 42, %11, %2 \n\t"
"lxvw4x 50, %11, %3 \n\t"
"lxvw4x 43, %12, %2 \n\t"
"lxvw4x 51, %12, %3 \n\t"
"lxvw4x 44, %13, %2 \n\t"
"lxvw4x %x4, %13, %3 \n\t"
"lxvw4x 45, %14, %2 \n\t"
"lxvw4x %x5, %14, %3 \n\t"
"lxvw4x 46, %15, %2 \n\t"
"lxvw4x %x6, %15, %3 \n\t"
"lxvw4x 47, %16, %2 \n\t"
"lxvw4x %x7, %16, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 41, %10, %2 \n\t"
"lxvd2x 49, %10, %3 \n\t"
"lxvd2x 42, %11, %2 \n\t"
"lxvd2x 50, %11, %3 \n\t"
"lxvd2x 43, %12, %2 \n\t"
"lxvd2x 51, %12, %3 \n\t"
"lxvd2x 44, %13, %2 \n\t"
"lxvd2x %x4, %13, %3 \n\t"
"lxvd2x 45, %14, %2 \n\t"
"lxvd2x %x5, %14, %3 \n\t"
"lxvd2x 46, %15, %2 \n\t"
"lxvd2x %x6, %15, %3 \n\t"
"lxvd2x 47, %16, %2 \n\t"
"lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"1: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"xvmaddasp 33, 41, 49 \n\t"
"lxvw4x 41, %10, %2 \n\t"
"lxvw4x 49, %10, %3 \n\t"
"lxvd2x 41, %10, %2 \n\t"
"lxvd2x 49, %10, %3 \n\t"
"xvmaddasp 34, 42, 50 \n\t"
"lxvw4x 42, %11, %2 \n\t"
"lxvw4x 50, %11, %3 \n\t"
"lxvd2x 42, %11, %2 \n\t"
"lxvd2x 50, %11, %3 \n\t"
"xvmaddasp 35, 43, 51 \n\t"
"lxvw4x 43, %12, %2 \n\t"
"lxvw4x 51, %12, %3 \n\t"
"lxvd2x 43, %12, %2 \n\t"
"lxvd2x 51, %12, %3 \n\t"
"xvmaddasp 36, 44, %x4 \n\t"
"lxvw4x 44, %13, %2 \n\t"
"lxvw4x %x4, %13, %3 \n\t"
"lxvd2x 44, %13, %2 \n\t"
"lxvd2x %x4, %13, %3 \n\t"
"xvmaddasp 37, 45, %x5 \n\t"
"lxvw4x 45, %14, %2 \n\t"
"lxvw4x %x5, %14, %3 \n\t"
"lxvd2x 45, %14, %2 \n\t"
"lxvd2x %x5, %14, %3 \n\t"
"xvmaddasp 38, 46, %x6 \n\t"
"lxvw4x 46, %15, %2 \n\t"
"lxvw4x %x6, %15, %3 \n\t"
"lxvd2x 46, %15, %2 \n\t"
"lxvd2x %x6, %15, %3 \n\t"
"xvmaddasp 39, 47, %x7 \n\t"
"lxvw4x 47, %16, %2 \n\t"
"lxvw4x %x7, %16, %3 \n\t"
"lxvd2x 47, %16, %2 \n\t"
"lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"


@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xscvdpspn 37, %x14 \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t"
"lxvw4x 32, 0, %3 \n\t" // load x
"lxvw4x 33, %15, %3 \n\t"
"lxvw4x 34, %16, %3 \n\t"
"lxvw4x 35, %17, %3 \n\t"
"lxvd2x 32, 0, %3 \n\t" // load x
"lxvd2x 33, %15, %3 \n\t"
"lxvd2x 34, %16, %3 \n\t"
"lxvd2x 35, %17, %3 \n\t"
"lxvw4x 48, 0, %4 \n\t" // load y
"lxvw4x 49, %15, %4 \n\t"
"lxvw4x 50, %16, %4 \n\t"
"lxvw4x 51, %17, %4 \n\t"
"lxvd2x 48, 0, %4 \n\t" // load y
"lxvd2x 49, %15, %4 \n\t"
"lxvd2x 50, %16, %4 \n\t"
"lxvd2x 51, %17, %4 \n\t"
"addi %3, %3, 64 \n\t"
"addi %4, %4, 64 \n\t"
@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"lxvw4x 32, 0, %3 \n\t" // load x
"lxvw4x 33, %15, %3 \n\t"
"lxvd2x 32, 0, %3 \n\t" // load x
"lxvd2x 33, %15, %3 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"lxvw4x 34, %16, %3 \n\t"
"lxvw4x 35, %17, %3 \n\t"
"lxvd2x 34, %16, %3 \n\t"
"lxvd2x 35, %17, %3 \n\t"
"xvmulsp %x9, 48, 37 \n\t" // s * y
"xvmulsp %x10, 49, 37 \n\t"
"lxvw4x 48, 0, %4 \n\t" // load y
"lxvw4x 49, %15, %4 \n\t"
"lxvd2x 48, 0, %4 \n\t" // load y
"lxvd2x 49, %15, %4 \n\t"
"xvmulsp %x11, 50, 37 \n\t"
"xvmulsp %x12, 51, 37 \n\t"
"lxvw4x 50, %16, %4 \n\t"
"lxvw4x 51, %17, %4 \n\t"
"lxvd2x 50, %16, %4 \n\t"
"lxvd2x 51, %17, %4 \n\t"
"xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
"xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvw4x 40, 0, %3 \n\t" // store x
"stxvw4x 41, %15, %3 \n\t"
"stxvw4x 42, %16, %3 \n\t"
"stxvw4x 43, %17, %3 \n\t"
"stxvd2x 40, 0, %3 \n\t" // store x
"stxvd2x 41, %15, %3 \n\t"
"stxvd2x 42, %16, %3 \n\t"
"stxvd2x 43, %17, %3 \n\t"
"stxvw4x %x5, 0, %4 \n\t" // store y
"stxvw4x %x6, %15, %4 \n\t"
"stxvw4x %x7, %16, %4 \n\t"
"stxvw4x %x8, %17, %4 \n\t"
"stxvd2x %x5, 0, %4 \n\t" // store y
"stxvd2x %x6, %15, %4 \n\t"
"stxvd2x %x7, %16, %4 \n\t"
"stxvd2x %x8, %17, %4 \n\t"
"addi %3, %3, 128 \n\t"
"addi %4, %4, 128 \n\t"
@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvw4x 40, 0, %3 \n\t" // store x
"stxvw4x 41, %15, %3 \n\t"
"stxvw4x 42, %16, %3 \n\t"
"stxvw4x 43, %17, %3 \n\t"
"stxvd2x 40, 0, %3 \n\t" // store x
"stxvd2x 41, %15, %3 \n\t"
"stxvd2x 42, %16, %3 \n\t"
"stxvd2x 43, %17, %3 \n\t"
"stxvw4x %x5, 0, %4 \n\t" // store y
"stxvw4x %x6, %15, %4 \n\t"
"stxvw4x %x7, %16, %4 \n\t"
"stxvw4x %x8, %17, %4 \n"
"stxvd2x %x5, 0, %4 \n\t" // store y
"stxvd2x %x6, %15, %4 \n\t"
"stxvd2x %x7, %16, %4 \n\t"
"stxvd2x %x8, %17, %4 \n"
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"


@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xscvdpspn %x3, %x3 \n\t"
"xxspltw %x3, %x3, 0 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t"
"lxvw4x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t"
"lxvw4x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t"
"lxvw4x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
"lxvd2x 34, %5, %2 \n\t"
"lxvd2x 35, %6, %2 \n\t"
"lxvd2x 36, %7, %2 \n\t"
"lxvd2x 37, %8, %2 \n\t"
"lxvd2x 38, %9, %2 \n\t"
"lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"
@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
"xvmulsp 42, 34, %x3 \n\t"
"xvmulsp 43, 35, %x3 \n\t"
"lxvw4x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t"
"lxvd2x 34, %5, %2 \n\t"
"lxvd2x 35, %6, %2 \n\t"
"xvmulsp 44, 36, %x3 \n\t"
"xvmulsp 45, 37, %x3 \n\t"
"lxvw4x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t"
"lxvd2x 36, %7, %2 \n\t"
"lxvd2x 37, %8, %2 \n\t"
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
"lxvw4x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t"
"lxvd2x 38, %9, %2 \n\t"
"lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, -128 \n\t"
"stxvw4x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n\t"
"stxvd2x 40, 0, %2 \n\t"
"stxvd2x 41, %4, %2 \n\t"
"stxvd2x 42, %5, %2 \n\t"
"stxvd2x 43, %6, %2 \n\t"
"stxvd2x 44, %7, %2 \n\t"
"stxvd2x 45, %8, %2 \n\t"
"stxvd2x 46, %9, %2 \n\t"
"stxvd2x 47, %10, %2 \n\t"
"addi %2, %2, 256 \n\t"
@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
"stxvw4x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n"
"stxvd2x 40, 0, %2 \n\t"
"stxvd2x 41, %4, %2 \n\t"
"stxvd2x 42, %5, %2 \n\t"
"stxvd2x 43, %6, %2 \n\t"
"stxvd2x 44, %7, %2 \n\t"
"stxvd2x 45, %8, %2 \n\t"
"stxvd2x 46, %9, %2 \n\t"
"stxvd2x 47, %10, %2 \n"
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:
@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
".p2align 5 \n"
"1: \n\t"
"stxvw4x %x3, 0, %2 \n\t"
"stxvw4x %x3, %4, %2 \n\t"
"stxvw4x %x3, %5, %2 \n\t"
"stxvw4x %x3, %6, %2 \n\t"
"stxvw4x %x3, %7, %2 \n\t"
"stxvw4x %x3, %8, %2 \n\t"
"stxvw4x %x3, %9, %2 \n\t"
"stxvw4x %x3, %10, %2 \n\t"
"stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t"
"stxvd2x %x3, %5, %2 \n\t"
"stxvd2x %x3, %6, %2 \n\t"
"stxvd2x %x3, %7, %2 \n\t"
"stxvd2x %x3, %8, %2 \n\t"
"stxvd2x %x3, %9, %2 \n\t"
"stxvd2x %x3, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"


@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
"lxvw4x 32, 0, %4 \n\t"
"lxvw4x 33, %5, %4 \n\t"
"lxvw4x 34, %6, %4 \n\t"
"lxvw4x 35, %7, %4 \n\t"
"lxvw4x 36, %8, %4 \n\t"
"lxvw4x 37, %9, %4 \n\t"
"lxvw4x 38, %10, %4 \n\t"
"lxvw4x 39, %11, %4 \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %4 \n\t"
"lxvd2x 35, %7, %4 \n\t"
"lxvd2x 36, %8, %4 \n\t"
"lxvd2x 37, %9, %4 \n\t"
"lxvd2x 38, %10, %4 \n\t"
"lxvd2x 39, %11, %4 \n\t"
"lxvw4x 40, 0, %3 \n\t"
"lxvw4x 41, %5, %3 \n\t"
"lxvw4x 42, %6, %3 \n\t"
"lxvw4x 43, %7, %3 \n\t"
"lxvw4x 44, %8, %3 \n\t"
"lxvw4x 45, %9, %3 \n\t"
"lxvw4x 46, %10, %3 \n\t"
"lxvw4x 47, %11, %3 \n\t"
"lxvd2x 40, 0, %3 \n\t"
"lxvd2x 41, %5, %3 \n\t"
"lxvd2x 42, %6, %3 \n\t"
"lxvd2x 43, %7, %3 \n\t"
"lxvd2x 44, %8, %3 \n\t"
"lxvd2x 45, %9, %3 \n\t"
"lxvd2x 46, %10, %3 \n\t"
"lxvd2x 47, %11, %3 \n\t"
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %4 \n\t"
"stxvw4x 41, %5, %4 \n\t"
"stxvw4x 42, %6, %4 \n\t"
"stxvw4x 43, %7, %4 \n\t"
"stxvw4x 44, %8, %4 \n\t"
"stxvw4x 45, %9, %4 \n\t"
"stxvw4x 46, %10, %4 \n\t"
"stxvw4x 47, %11, %4 \n\t"
"stxvd2x 40, 0, %4 \n\t"
"stxvd2x 41, %5, %4 \n\t"
"stxvd2x 42, %6, %4 \n\t"
"stxvd2x 43, %7, %4 \n\t"
"stxvd2x 44, %8, %4 \n\t"
"stxvd2x 45, %9, %4 \n\t"
"stxvd2x 46, %10, %4 \n\t"
"stxvd2x 47, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"

relapack/LICENSE (new file)

@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2016 Elmar Peise
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

relapack/Makefile (new file)

@ -0,0 +1,98 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
SRC = $(wildcard src/*.c)
SRC1 = \
src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \
src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \
src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \
src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c
SRC2 = \
src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \
src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \
src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \
src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \
src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \
src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \
src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \
src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \
src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c
SRCX = \
src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \
src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \
src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \
src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \
src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \
src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \
src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \
src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \
src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c
OBJS1 = $(SRC1:%.c=%.$(SUFFIX))
OBJS2 = $(SRC2:%.c=%.o)
OBJS = $(OBJS1) $(OBJS2)
TEST_SUITS = \
slauum dlauum clauum zlauum \
spotrf dpotrf cpotrf zpotrf \
spbtrf dpbtrf cpbtrf zpbtrf \
ssygst dsygst chegst zhegst \
ssytrf dsytrf csytrf chetrf zsytrf zhetrf \
sgetrf dgetrf cgetrf zgetrf \
sgbtrf dgbtrf cgbtrf zgbtrf \
strsyl dtrsyl ctrsyl ztrsyl \
stgsyl dtgsyl ctgsyl ztgsyl \
sgemmt dgemmt cgemmt zgemmt
TESTS = $(TEST_SUITS:%=test/%.pass) # dummies
TEST_EXES = $(TEST_SUITS:%=test/%.x)
LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm
.SECONDARY: $(TEST_EXES)
.PHONY: test
# ReLAPACK compilation
libs: $(OBJS)
@echo "Building ReLAPACK library $(LIBNAME)"
$(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS)
$(RANLIB) $(TOPDIR)/$(LIBNAME)
%.$(SUFFIX): %.c config.h
$(CC) $(CFLAGS) -c $< -o $@
%.o: %.c config.h
$(CC) $(CFLAGS) -c $< -o $@
# ReLAPACK testing
test: $(TEST_EXES) $(TESTS)
@echo "passed all tests"
test/%.pass: test/%.x
@echo -n $*:
@./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<)
test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
# cleaning up
clean:
rm -f $(OBJS) test/util.$(SUFFIX) test/*.x

relapack/README.md (new file)

@ -0,0 +1,68 @@
ReLAPACK
========
[![Build Status](https://travis-ci.org/HPAC/ReLAPACK.svg?branch=master)](https://travis-ci.org/HPAC/ReLAPACK)
[Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK)
ReLAPACK offers a collection of recursive algorithms for many of LAPACK's
compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK
integrates effortlessly into existing application codes. ReLAPACK's routines
not only outperform the reference LAPACK but also improve upon the performance
of tuned implementations, such as OpenBLAS and MKL.
Coverage
--------
For a detailed list of covered operations and an overview of operations to which
recursion is not efficiently applicable, see [coverage.md](coverage.md).
Installation
------------
To compile with the default configuration, simply run `make` to create the
library `librelapack.a`.
### Linking with MKL
Note that to link with MKL, you currently need to set the flag
`COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and
`ztrsyl`. For further configuration options see [config.md](config.md).
### Dependencies
ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked
kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized
and machine specific implementations of these libraries, which are commonly
provided by hardware vendors or available as open source (e.g.,
[OpenBLAS](http://www.openblas.net/)).
Testing
-------
ReLAPACK's test suite compares its routines numerically with LAPACK's
counterparts. To set up the tests (located in `test/`), you need to specify
link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then
`make test` runs the tests. For details on the performed tests, see
[test/README.md](test/README.md).
Examples
--------
Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the
covered routines applies directly to ReLAPACK. A few separate examples are
given in `examples/`. For details, see [examples/README.md](examples/README.md).
Citing
------
When referencing ReLAPACK, please cite the preprint of the paper
[Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763):
@article{relapack,
author = {Elmar Peise and Paolo Bientinesi},
title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection},
journal = {CoRR},
volume = {abs/1602.06763},
year = {2016},
url = {http://arxiv.org/abs/1602.06763},
}

relapack/config.h (new file)

@ -0,0 +1,208 @@
#ifndef RELAPACK_CONFIG_H
#define RELAPACK_CONFIG_H
// ReLAPACK configuration file.
// See also config.md
///////////////////////////////
// BLAS/LAPACK object symbols //
///////////////////////////////
// BLAS routines linked against have a trailing underscore
#define BLAS_UNDERSCORE 1
// LAPACK routines linked against have a trailing underscore
#define LAPACK_UNDERSCORE BLAS_UNDERSCORE
// Complex BLAS/LAPACK routines return their result in the first argument
// This option must be enabled when linking to MKL for ctrsyl and ztrsyl to
// work.
#define COMPLEX_FUNCTIONS_AS_ROUTINES 0
#ifdef F_INTERFACE_INTEL
#define COMPLEX_FUNCTIONS_AS_ROUTINES 1
#endif
#define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
#define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
// The BLAS-like extension xgemmt is provided by an external library.
#define HAVE_XGEMMT 0
////////////////////////////
// Use malloc in ReLAPACK //
////////////////////////////
#define ALLOW_MALLOC 1
// allow malloc in xsygst for improved performance
#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC
// allow malloc in xsytrf if the passed work buffer is too small
#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
////////////////////////////////
// LAPACK routine replacement //
////////////////////////////////
// The following macros specify which routines are included in the library under
// LAPACK's symbol names: 1 included, 0 not included
#define INCLUDE_ALL 1
#define INCLUDE_XLAUUM INCLUDE_ALL
#define INCLUDE_SLAUUM INCLUDE_XLAUUM
#define INCLUDE_DLAUUM INCLUDE_XLAUUM
#define INCLUDE_CLAUUM INCLUDE_XLAUUM
#define INCLUDE_ZLAUUM INCLUDE_XLAUUM
#define INCLUDE_XSYGST INCLUDE_ALL
#define INCLUDE_SSYGST INCLUDE_XSYGST
#define INCLUDE_DSYGST INCLUDE_XSYGST
#define INCLUDE_CHEGST INCLUDE_XSYGST
#define INCLUDE_ZHEGST INCLUDE_XSYGST
#define INCLUDE_XTRTRI INCLUDE_ALL
#define INCLUDE_STRTRI INCLUDE_XTRTRI
#define INCLUDE_DTRTRI INCLUDE_XTRTRI
#define INCLUDE_CTRTRI INCLUDE_XTRTRI
#define INCLUDE_ZTRTRI INCLUDE_XTRTRI
#define INCLUDE_XPOTRF INCLUDE_ALL
#define INCLUDE_SPOTRF INCLUDE_XPOTRF
#define INCLUDE_DPOTRF INCLUDE_XPOTRF
#define INCLUDE_CPOTRF INCLUDE_XPOTRF
#define INCLUDE_ZPOTRF INCLUDE_XPOTRF
#define INCLUDE_XPBTRF INCLUDE_ALL
#define INCLUDE_SPBTRF INCLUDE_XPBTRF
#define INCLUDE_DPBTRF INCLUDE_XPBTRF
#define INCLUDE_CPBTRF INCLUDE_XPBTRF
#define INCLUDE_ZPBTRF INCLUDE_XPBTRF
#define INCLUDE_XSYTRF INCLUDE_ALL
#define INCLUDE_SSYTRF INCLUDE_XSYTRF
#define INCLUDE_DSYTRF INCLUDE_XSYTRF
#define INCLUDE_CSYTRF INCLUDE_XSYTRF
#define INCLUDE_CHETRF INCLUDE_XSYTRF
#define INCLUDE_ZSYTRF INCLUDE_XSYTRF
#define INCLUDE_ZHETRF INCLUDE_XSYTRF
#define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF
#define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF
#define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF
#define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF
#define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF
#define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF
#define INCLUDE_XGETRF INCLUDE_ALL
#define INCLUDE_SGETRF INCLUDE_XGETRF
#define INCLUDE_DGETRF INCLUDE_XGETRF
#define INCLUDE_CGETRF INCLUDE_XGETRF
#define INCLUDE_ZGETRF INCLUDE_XGETRF
#define INCLUDE_XGBTRF INCLUDE_ALL
#define INCLUDE_SGBTRF INCLUDE_XGBTRF
#define INCLUDE_DGBTRF INCLUDE_XGBTRF
#define INCLUDE_CGBTRF INCLUDE_XGBTRF
#define INCLUDE_ZGBTRF INCLUDE_XGBTRF
#define INCLUDE_XTRSYL INCLUDE_ALL
#define INCLUDE_STRSYL INCLUDE_XTRSYL
#define INCLUDE_DTRSYL INCLUDE_XTRSYL
#define INCLUDE_CTRSYL INCLUDE_XTRSYL
#define INCLUDE_ZTRSYL INCLUDE_XTRSYL
#define INCLUDE_XTGSYL INCLUDE_ALL
#define INCLUDE_STGSYL INCLUDE_XTGSYL
#define INCLUDE_DTGSYL INCLUDE_XTGSYL
#define INCLUDE_CTGSYL INCLUDE_XTGSYL
#define INCLUDE_ZTGSYL INCLUDE_XTGSYL
#define INCLUDE_XGEMMT 0
#define INCLUDE_SGEMMT INCLUDE_XGEMMT
#define INCLUDE_DGEMMT INCLUDE_XGEMMT
#define INCLUDE_CGEMMT INCLUDE_XGEMMT
#define INCLUDE_ZGEMMT INCLUDE_XGEMMT
/////////////////////
// crossover sizes //
/////////////////////
// default crossover size
#define CROSSOVER 24
// individual crossover sizes
#define CROSSOVER_XLAUUM CROSSOVER
#define CROSSOVER_SLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_DLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_CLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_XSYGST CROSSOVER
#define CROSSOVER_SSYGST CROSSOVER_XSYGST
#define CROSSOVER_DSYGST CROSSOVER_XSYGST
#define CROSSOVER_CHEGST CROSSOVER_XSYGST
#define CROSSOVER_ZHEGST CROSSOVER_XSYGST
#define CROSSOVER_XTRTRI CROSSOVER
#define CROSSOVER_STRTRI CROSSOVER_XTRTRI
#define CROSSOVER_DTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_CTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_XPOTRF CROSSOVER
#define CROSSOVER_SPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_DPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_CPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_XPBTRF CROSSOVER
#define CROSSOVER_SPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_DPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_CPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_XSYTRF CROSSOVER
#define CROSSOVER_SSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_DSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_CSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_CHETRF CROSSOVER_XSYTRF
#define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_ZHETRF CROSSOVER_XSYTRF
#define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF
#define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF
#define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF
#define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF
#define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF
#define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF
#define CROSSOVER_XGETRF CROSSOVER
#define CROSSOVER_SGETRF CROSSOVER_XGETRF
#define CROSSOVER_DGETRF CROSSOVER_XGETRF
#define CROSSOVER_CGETRF CROSSOVER_XGETRF
#define CROSSOVER_ZGETRF CROSSOVER_XGETRF
#define CROSSOVER_XGBTRF CROSSOVER
#define CROSSOVER_SGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_DGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_CGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_XTRSYL CROSSOVER
#define CROSSOVER_STRSYL CROSSOVER_XTRSYL
#define CROSSOVER_DTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_CTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_XTGSYL CROSSOVER
#define CROSSOVER_STGSYL CROSSOVER_XTGSYL
#define CROSSOVER_DTGSYL CROSSOVER_XTGSYL
#define CROSSOVER_CTGSYL CROSSOVER_XTGSYL
#define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL
// sytrf helper routine
#define CROSSOVER_XGEMMT CROSSOVER_XSYTRF
#define CROSSOVER_SGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_DGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_CGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT
#endif /* RELAPACK_CONFIG_H */

relapack/config.md (new file)

@ -0,0 +1,87 @@
ReLAPACK Configuration
======================
ReLAPACK has two configuration files: `make.inc`, which is included by the
Makefile, and `config.h`, which is included in the source files.
Build and Testing Environment
-----------------------------
The build environment (compiler and flags) and the test configuration (linker
flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size
and error bounds are defined in `test/config.h`.
The library `librelapack.a` is compiled by invoking `make`. The tests are
performed by either `make test` or calling `make` in the test folder.
BLAS/LAPACK complex function interfaces
---------------------------------------
For BLAS and LAPACK functions that return a complex number, there exist two
conflicting (FORTRAN-compiler-dependent) calling conventions: either the result
is returned as a `struct` of two floating-point numbers, or an additional first
argument with a pointer to such a `struct` is used. By default ReLAPACK uses
the former (which is what gfortran uses), but it can switch to the latter by
setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK
specific counterparts) to `1` in `config.h`.
**For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.**
(Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases
will segfault or return errors on the order of 1 or larger.)
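As a sketch of the two conventions for a complex-valued BLAS function such as
`cdotc` (the C prototypes below are illustrative; the Fortran side is
unchanged):

    #include <complex.h>

    #if COMPLEX_FUNCTIONS_AS_ROUTINES
    /* the result is delivered through a hidden first argument */
    void cdotc_(float complex *result, const int *n,
                const float complex *x, const int *incx,
                const float complex *y, const int *incy);
    #else
    /* the result is returned by value (the gfortran convention) */
    float complex cdotc_(const int *n,
                         const float complex *x, const int *incx,
                         const float complex *y, const int *incy);
    #endif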
BLAS extension `xgemmt`
-----------------------
The LDL decompositions require a general matrix-matrix product, called `xgemmt`,
that updates only a triangular part of its result. If the BLAS implementation
linked against provides such a routine, set the flag `HAVE_XGEMMT` to `1` in
`config.h`; otherwise, ReLAPACK uses its own recursive implementation of this
kernel.
`xgemmt` is provided by MKL.
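A minimal reference sketch of the operation for the non-transposed,
lower-triangular, single-precision case (column-major storage; the helper name
is illustrative, not a library symbol):

    /* C(i,j) = alpha * A(i,:) * B(:,j) + beta * C(i,j), for i >= j only */
    static void sgemmt_lower_nn(int n, int k, float alpha,
                                const float *A, int ldA,
                                const float *B, int ldB,
                                float beta, float *C, int ldC)
    {
        for (int j = 0; j < n; j++)
            for (int i = j; i < n; i++) {        /* lower triangle only */
                float s = 0.0f;
                for (int l = 0; l < k; l++)
                    s += A[i + l * ldA] * B[l + j * ldB];
                C[i + j * ldC] = alpha * s + beta * C[i + j * ldC];
            }
    }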
Routine Selection
-----------------
ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the
corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to
`1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g.,
`dgetrf_`). By default, wrappers for all routines are enabled.
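For example, assuming standard LAPACK argument passing (exact integer types
may differ by build; this is only a sketch):

    extern void RELAPACK_dgetrf(const int *m, const int *n, double *A,
                                const int *ldA, int *ipiv, int *info);
    extern void dgetrf_(const int *m, const int *n, double *A,
                        const int *ldA, int *ipiv, int *info);

    void lu_factor(int n, double *A, int *ipiv)
    {
        int info;
        RELAPACK_dgetrf(&n, &n, A, &n, ipiv, &info); /* always exported */
        /* dgetrf_(&n, &n, A, &n, ipiv, &info);  the LAPACK-named wrapper,
           available only if INCLUDE_DGETRF == 1 */
    }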
Crossover Size
--------------
The crossover size determines below which matrix sizes ReLAPACK's recursive
algorithms switch to LAPACK's unblocked routines in order to avoid tiny BLAS
Level 3 calls. The crossover size is set in `config.h` and can be chosen
either globally for the entire library, per operation, or individually per
routine, as sketched below.
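For example, to recurse further down only for the LU decompositions while
keeping the global default, the corresponding lines in `config.h` could read
(the values are illustrative):

    #define CROSSOVER 24                        /* global default */
    #define CROSSOVER_XGETRF 16                 /* all getrf variants */
    #define CROSSOVER_DGETRF CROSSOVER_XGETRF   /* or an individual value */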
Allowing Temporary Buffers
--------------------------
Two of ReLAPACK's routines make use of temporary buffers, which are allocated
and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the
routine-specific counterparts) to 0 in `config.h` will disable these buffers. The
affected routines are:
* `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in
LAPACK, this size can be queried by setting `lWork = -1`; the passed buffer
is used if it is large enough, and a local buffer is allocated only if it is
not (see the query sketch after this list).
The advantage of this mechanism is that ReLAPACK works seamlessly even with
codes that statically provide too little workspace, instead of breaking them.
* `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem
to standard form can use an auxiliary buffer of size n^2 / 2 to avoid
redundant computations. It thereby performs about 30% fewer FLOPs than
LAPACK.
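A sketch of the `lWork = -1` workspace query described above, using LAPACK's
standard `dsytrf` interface (exact integer types depend on the build):

    #include <stdlib.h>

    extern void dsytrf_(const char *uplo, const int *n, double *A,
                        const int *ldA, int *ipiv, double *work,
                        const int *lWork, int *info);

    void factor_ldl(int n, double *A, int *ipiv)
    {
        int info, lWork = -1;
        double query;
        dsytrf_("L", &n, A, &n, ipiv, &query, &lWork, &info); /* size query */
        lWork = (int) query;
        double *work = malloc(lWork * sizeof(double));
        dsytrf_("L", &n, A, &n, ipiv, work, &lWork, &info);   /* factorize */
        free(work);
    }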
FORTRAN symbol names
--------------------
ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN interfaces.
Since these libraries usually append a trailing underscore to their symbol
names, ReLAPACK has configuration switches in `config.h` to adjust the
corresponding routine names.
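A minimal sketch of such a name-mangling switch (the macro name `BLAS` here is
illustrative, not ReLAPACK's actual internal macro):

    #if BLAS_UNDERSCORE
    #define BLAS(routine) routine##_   /* dgemm -> dgemm_ */
    #else
    #define BLAS(routine) routine      /* dgemm -> dgemm  */
    #endif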

relapack/coverage.md (new file)

@ -0,0 +1,212 @@
Coverage of ReLAPACK
====================
This file lists all LAPACK compute routines that are covered by recursive
algorithms in ReLAPACK; it also lists all of LAPACK's blocked algorithms that
are not (yet) part of ReLAPACK.
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
- [List of covered LAPACK routines](#list-of-covered-lapack-routines)
- [`xlauum`](#xlauum)
- [`xsygst`](#xsygst)
- [`xtrtri`](#xtrtri)
- [`xpotrf`](#xpotrf)
- [`xpbtrf`](#xpbtrf)
- [`xsytrf`](#xsytrf)
- [`xgetrf`](#xgetrf)
- [`xgbtrf`](#xgbtrf)
- [`xtrsyl`](#xtrsyl)
- [`xtgsyl`](#xtgsyl)
- [Covered BLAS extension](#covered-blas-extension)
- [`xgemmt`](#xgemmt)
- [Not covered yet](#not-covered-yet)
- [`xpstrf`](#xpstrf)
- [Not covered: extra FLOPs](#not-covered-extra-flops)
- [QR decomposition (and related)](#qr-decomposition-and-related)
- [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal)
- [Reduction to bidiagonal](#reduction-to-bidiagonal)
- [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
List of covered LAPACK routines
-------------------------------
### `xlauum`
Multiplication of a triangular matrix with its (complex conjugate) transpose,
resulting in a symmetric (Hermitian) matrix.
Routines: `slauum`, `dlauum`, `clauum`, `zlauum`
Operations:
* A = L^T L
* A = U U^T
### `xsygst`
Simultaneous two-sided multiplication of a symmetric matrix with a triangular
matrix and its transpose
Routines: `ssygst`, `dsygst`, `chegst`, `zhegst`
Operations:
* A = inv(L) A inv(L^T)
* A = inv(U^T) A inv(U)
* A = L^T A L
* A = U A U^T
### `xtrtri`
Inversion of a triangular matrix
Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri`
Operations:
* L = inv(L)
* U = inv(U)
### `xpotrf`
Cholesky decomposition of a symmetric (Hermitian) positive definite matrix
Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf`
Operations:
* L L^T = A
* U^T U = A
### `xpbtrf`
Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix
Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf`
Operations:
* L L^T = A
* U^T U = A
### `xsytrf`
LDL decomposition of a symmetric (or Hermitian) matrix
Routines:
* `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`,
* `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`,
`zhetrf_rook`
Operations:
* L D L^T = A
* U D U^T = A
### `xgetrf`
LU decomposition of a general matrix with pivoting
Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf`
Operation: P L U = A
### `xgbtrf`
LU decomposition of a general banded matrix with pivoting
Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf`
Operation: L U = A
### `xtrsyl`
Solution of the quasi-triangular Sylvester equation
Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl`
Operations:
* A X + X B = C -> X
* A^T X + X B = C -> X
* A X + X B^T = C -> X
* A^T X + X B^T = C -> X
* A X - X B = C -> X
* A^T X - X B = C -> X
* A X - X B^T = C -> X
* A^T X - X B^T = C -> X
### `xtgsyl`
Solution of the generalized Sylvester equations
Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl`
Operations:
* A R - L B = C, D R - L E = F -> L, R
* A^T R + D^T L = C, R B^T - L E^T = -F -> L, R
Covered BLAS extension
----------------------
### `xgemmt`
Matrix-matrix product updating only a triangular part of the result
Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt`
Operations:
* C = alpha A B + beta C
* C = alpha A B^T + beta C
* C = alpha A^T B + beta C
* C = alpha A^T B^T + beta C
Not covered yet
---------------
The following operation is implemented as a blocked algorithm in LAPACK but is
not yet covered by a recursive algorithm in ReLAPACK.
### `xpstrf`
Cholesky decomposition of a positive semi-definite matrix with complete pivoting.
Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf`
Operations:
* P L L^T P^T = A
* P U^T U P^T = A
Not covered: extra FLOPs
------------------------
The following routines are not covered because recursive variants would require
considerably more FLOPs or operate on banded matrices.
### QR decomposition (and related)
Routines:
* `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf`
* `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf`
* `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf`
* `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf`
* `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf`
Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A
Routines for multiplication with Q:
* `sormqr`, `dormqr`, `cunmqr`, `zunmqr`
* `sormrq`, `dormrq`, `cunmrq`, `zunmrq`
* `sormql`, `dormql`, `cunmql`, `zunmql`
* `sormlq`, `dormlq`, `cunmlq`, `zunmlq`
* `sormrz`, `dormrz`, `cunmrz`, `zunmrz`
Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T
Routines for construction of Q:
* `sorgqr`, `dorgqr`, `cungqr`, `zungqr`
* `sorgrq`, `dorgrq`, `cungrq`, `zungrq`
* `sorgql`, `dorgql`, `cungql`, `zungql`
* `sorglq`, `dorglq`, `cunglq`, `zunglq`
### Symmetric reduction to tridiagonal
Routines: `ssytrd`, `dsytrd`, `chetrd`, `zhetrd`
Operation: Q T Q^T = A
### Reduction to bidiagonal
Routines: `sgebrd`, `dgebrd`, `cgebrd`, `zgebrd`
Operation: Q B P^T = A
### Reduction to upper Hessenberg
Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd`
Operation: Q H Q^T = A

67
relapack/inc/relapack.h Normal file
View File

@ -0,0 +1,67 @@
#ifndef RELAPACK_H
#define RELAPACK_H
void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);
void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);
void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *);
void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
#endif /* RELAPACK_H */

61
relapack/src/blas.h Normal file
View File

@ -0,0 +1,61 @@
#ifndef BLAS_H
#define BLAS_H
extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *);
extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *);
extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *);
extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *);
extern void BLAS(sscal)(const int *, const float *, float *, const int *);
extern void BLAS(dscal)(const int *, const double *, double *, const int *);
extern void BLAS(cscal)(const int *, const float *, float *, const int *);
extern void BLAS(zscal)(const int *, const double *, double *, const int *);
extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
#if HAVE_XGEMMT
extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
#endif
#endif /* BLAS_H */

230
relapack/src/cgbtrf.c Normal file
View File

@ -0,0 +1,230 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *,
const int *, float *, const int *, int *, float *, const int *, float *,
const int *, int *);
/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's cgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html
* */
void RELAPACK_cgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CGBTRF", &minfo);
return;
}
// Constant
const float ZERO[] = { 0., 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * kv;
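// (LAPACK band storage places A(i,j) at Ab[kv + i - j + *ldAb * j]; with the
// leading dimension shortened by 1, the i-j skew cancels and the in-band
// entries become addressable as an ordinary array A[i + *ldA * j]. Complex
// entries occupy two floats, hence the factors of 2 throughout.)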
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
float *const A_j = A + 2 * *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[2 * i] = A_j[2 * i + 1] = 0.;
}
// Allocate work space
const int n1 = CREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float));
float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float));
LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** cgbtrf's recursive compute kernel */
static void RELAPACK_cgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_CGBTRF, 1)) {
// Unblocked
LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * kv;
// Splitting
const int n1 = MIN(CREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
float *const Ab_L = Ab;
float *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * m1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
float *const A_Rl = A_R;
float *const A_Rr = A_R + 2 * *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + 2 * *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + 2 * m21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + 2 * *ldA * n21;
float *const A_BRbl = A_BR + 2 * m21;
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
float *const A_Rrj = A_Rr + 2 * *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const float tmpr = A_Rrj[2 * i];
const float tmpc = A_Rrj[2 * i + 1];
A_Rrj[2 * i] = A_Rrj[2 * ip];
A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
A_Rrj[2 * ip] = tmpr;
A_Rrj[2 * ip + 1] = tmpc;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}

167
relapack/src/cgemmt.c Normal file
View File

@ -0,0 +1,167 @@
#include "relapack.h"
static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
/** CGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
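/* Usage sketch (all variables illustrative): update only the lower triangle
 * of C with C := alpha*A*B + beta*C:
 *   RELAPACK_cgemmt("L", "N", "N", &n, &k, alpha, A, &ldA, B, &ldB, beta, C, &ldC);
 * */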
void RELAPACK_cgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int ctransA = LAPACK(lsame)(transA, "C");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
const int ctransB = LAPACK(lsame)(transB, "C");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !ctransA && !notransA)
info = 2;
else if (!tranB && !ctransB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("CGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');
// Recursive kernel
RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** cgemmt's recursive compute kernel */
static void RELAPACK_cgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_CGEMMT, 1)) {
// Unblocked
RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const float *const A_T = A;
const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const float *const B_L = B;
const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
float *const C_TL = C;
float *const C_TR = C + 2 * *ldC * n1;
float *const C_BL = C + 2 * n1;
float *const C_BR = C + 2 * *ldC * n1 + 2 * n1;
// recursion(C_TL)
RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** cgemmt's unblocked compute kernel */
static void RELAPACK_cgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const float *const A_0 = A;
const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
float *const C_0i = C + 2 * *ldC * i;
float *const C_ii = C + 2 * *ldC * i + 2 * i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}

117
relapack/src/cgetrf.c Normal file
View File

@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
const int *, int *, int *);
/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's cgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
*/
void RELAPACK_cgetrf(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const float ONE[] = { 1., 0. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const float *const A_L = A;
float *const A_R = A + 2 * *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** cgetrf's recursive compute kernel */
static void RELAPACK_cgetrf_rec(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
// Unblocked
LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}

212
relapack/src/chegst.c Normal file
View File

@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_chegst_rec(const int *, const char *, const int *,
float *, const int *, const float *, const int *,
float *, const int *, int *);
/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's chegst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html
* */
void RELAPACK_chegst(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHEGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
float *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = CREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * 2 * sizeof(float));
if (!Work)
lWork = 0;
#endif
// recursive kernel
RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** chegst's recursive compute kernel */
static void RELAPACK_chegst_rec(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
float *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_CHEGST, 1)) {
// Unblocked
LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const float ZERO[] = { 0., 0. };
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const float HALF[] = { .5, 0. };
const float MHALF[] = { -.5, 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// B_TL B_TR
// B_BL B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BL = B + 2 * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// recursion(A_TL, B_TL)
RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}

236
relapack/src/chetrf.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's chetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
* */
void RELAPACK_chetrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** chetrf's recursive compute kernel */
static void RELAPACK_chetrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
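    // Note: n requests a panel width; n_out reports how many columns were
    // actually factored. A 2x2 Bunch-Kaufman pivot may not straddle the
    // recursion boundary, so a recursive call can fall one column short
    // (compensated by the "undo 1 column of updates" blocks below).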
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

520
relapack/src/chetrf_rec2.c Normal file
View File

@ -0,0 +1,520 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
*
* This routine is a minor modification of LAPACK's clahef.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int *
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2, r__3, r__4;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k;
static float t, r1;
static complex d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen), ccopy_(int *, complex *, int *,
complex *, int *), cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float absakk;
extern /* Subroutine */ int clacgv_(int *, complex *, int *);
extern int icamax_(int *, complex *, int *);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
i__1 = k - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - 1;
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (kw - 1) * w_dim1]), dabs(r__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
r1 = 1.f / a[i__1].r;
i__1 = k - 1;
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + kw * w_dim1], &q__2);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
q__2.r = t, q__2.i = 0.f;
c_div(&q__1, &q__2, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
r_cnjg(&q__2, &d21);
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
q__2.r * q__3.i + q__2.i * q__3.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (imax < *n) {
i__1 = *n - imax;
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (k + 1) * w_dim1]), dabs(r__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
r1 = 1.f / a[i__1].r;
i__1 = *n - k;
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + k * w_dim1], &q__2);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
q__2.r = t, q__2.i = 0.f;
c_div(&q__1, &q__2, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
r_cnjg(&q__2, &d21);
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
q__2.r * q__3.i + q__2.i * q__3.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
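The kernel above records its pivoting decisions in ipiv: a positive entry marks a 1x1 diagonal pivot block (ipiv[k] = kp gives the row interchanged with k), while a 2x2 block is marked by negative entries in two consecutive positions (here ipiv[k] = ipiv[k+1] = -kp; the rook variants further down store two distinct indices, -p and -kp). A minimal decoding sketch, assuming 0-based C indexing; the helper name is hypothetical, not part of this commit:

#include <stdio.h>

// Count 1x1 and 2x2 pivot blocks in a Bunch-Kaufman pivot vector.
static void count_pivot_blocks(const int *ipiv, int n) {
    int k = 0, n1x1 = 0, n2x2 = 0;
    while (k < n) {
        if (ipiv[k] > 0) {
            ++n1x1;          // 1x1 pivot block
            k += 1;
        } else {
            ++n2x2;          // 2x2 pivot block spans columns k and k+1
            k += 2;
        }
    }
    printf("%d 1x1 and %d 2x2 pivot blocks\n", n1x1, n2x2);
}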

236
relapack/src/chetrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's chetrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html
* */
void RELAPACK_chetrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** chetrf_rook's recursive compute kernel */
static void RELAPACK_chetrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
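A minimal calling sketch for the wrapper above, under the conventions visible in its argument checks: complex entries are passed as interleaved float pairs, and lWork = -1 performs a workspace-size query that leaves the required length (in complex elements) in Work[0]. The helper name and error handling are illustrative assumptions, not part of this commit.

#include <stdlib.h>

void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *,
                          int *, float *, const int *, int *);

// Factor a Hermitian n x n matrix A (column-major, interleaved re/im floats).
static int factor_hermitian(int n, float *A, int *ipiv) {
    int info, lWork = -1;
    float query[2];
    // Workspace query: the required size is returned in query[0].
    RELAPACK_chetrf_rook("L", &n, A, &n, ipiv, query, &lWork, &info);
    lWork = (int)query[0];
    float *Work = malloc((size_t)lWork * 2 * sizeof(float));  // complex workspace
    if (!Work) return -1;
    RELAPACK_chetrf_rook("L", &n, A, &n, ipiv, Work, &lWork, &info);
    free(Work);
    return info;
}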

661
relapack/src/chetrf_rook_rec2.c Normal file
View File

@ -0,0 +1,661 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
*
* This routine is a minor modification of LAPACK's clahef_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, complex *a, int *lda, int *ipiv,
complex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4, q__5;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k, p;
static float t, r1;
static complex d11, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen);
static float sfmin;
extern /* Subroutine */ int ccopy_(int *, complex *, int *,
complex *, int *);
static int itemp;
extern /* Subroutine */ int cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float stemp, absakk;
extern /* Subroutine */ int clacgv_(int *, complex *, int *);
extern int icamax_(int *, complex *, int *);
extern double slamch_(char *, ftnlen);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
if (k > 1) {
i__1 = k - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
c__1);
}
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
if (k > 1) {
i__1 = k - 1;
ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
&c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
if (imax > 1) {
i__1 = imax - 1;
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
}
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = k - 1 - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
i__1 = k - 1 - p;
clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
if (p > 1) {
i__1 = p - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (dabs(t) >= sfmin) {
r1 = 1.f / t;
i__1 = k - 1;
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L14: */
}
}
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + kw * w_dim1], &q__2);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
r_cnjg(&q__5, &d21);
c_div(&q__2, &q__3, &q__5);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
++jj;
if (kstep == 2 && jp1 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
a_dim1], &c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (imax < *n) {
i__1 = *n - imax;
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (k + 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (k + 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = p - k - 1;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
a_dim1], lda);
i__1 = p - k - 1;
clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
if (p < *n) {
i__1 = *n - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
* a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
}
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (dabs(t) >= sfmin) {
r1 = 1.f / t;
i__1 = *n - k;
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L74: */
}
}
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + k * w_dim1], &q__2);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
r_cnjg(&q__5, &d21);
c_div(&q__2, &q__3, &q__5);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
--jj;
if (kstep == 2 && jp1 != jj && j >= 1) {
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
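The L12/L72 search loops above are the "rook" part of the pivoting: starting from the row imax of the column's largest off-diagonal entry, the search keeps hopping to the row of the current candidate column's largest entry until one of two acceptance tests fires, which bounds element growth from both the row and the column side. A real-valued sketch of the stopping rule, with a hypothetical helper name; alpha = (1 + sqrt(17))/8 is the classical Bunch-Kaufman constant:

#include <math.h>

// Returns nonzero when the rook pivot search may stop at row imax.
static int rook_search_done(double diag_imax, double rowmax, double colmax,
                            int p, int jmax) {
    const double alpha = (sqrt(17.0) + 1.0) / 8.0;
    if (fabs(diag_imax) >= alpha * rowmax)
        return 1;    // accept a 1x1 pivot at imax
    if (p == jmax || rowmax <= colmax)
        return 1;    // accept a 2x2 pivot on rows (p, imax)
    return 0;        // otherwise continue: p = imax, imax = jmax
}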

87
relapack/src/clauum.c Normal file
View File

@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_clauum_rec(const char *, const int *, float *,
const int *, int *);
/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's clauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html
* */
void RELAPACK_clauum(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info);
}
/** clauum's recursive compute kernel */
static void RELAPACK_clauum_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_CLAUUM, 1)) {
// Unblocked
LAPACK(clauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info);
}
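The two updates in each branch are the block algebra of the triangular product spelled out in the comments. For uplo = 'L', partitioning L conformally with A gives (a short derivation, restated from the code above):

L^H L = \begin{pmatrix} L_{TL}^H & L_{BL}^H \\ 0 & L_{BR}^H \end{pmatrix}
        \begin{pmatrix} L_{TL} & 0 \\ L_{BL} & L_{BR} \end{pmatrix}
      = \begin{pmatrix} L_{TL}^H L_{TL} + L_{BL}^H L_{BL} & L_{BL}^H L_{BR} \\
                        L_{BR}^H L_{BL} & L_{BR}^H L_{BR} \end{pmatrix}

so after the first recursion forms L_{TL}^H L_{TL} in place, cherk adds L_{BL}^H L_{BL} to A_TL, ctrmm overwrites A_BL with L_{BR}^H L_{BL}, and the second recursion produces L_{BR}^H L_{BR}. The uplo = 'U' branch is the mirror image for U U^H.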

157
relapack/src/cpbtrf.c Normal file
View File

@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *,
float *, const int *, float *, const int *, int *);
/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's cpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html
* */
void RELAPACK_cpbtrf(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const float ZERO[] = { 0., 0. };
// Allocate work space
const int n1 = CREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
float *Work = malloc(mWork * nWork * 2 * sizeof(float));
LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** cpbtrf's recursive compute kernel */
static void RELAPACK_cpbtrf_rec(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
float *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_CPBTRF, 1)) {
// Unblocked
LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(CREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
float *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, *kd);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + 2 * *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + 2 * n21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + 2 * *ldA * n21;
float *const A_BRbl = A_BR + 2 * n21;
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
if (*uplo == 'L') {
// A_BLt = A_BLt / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}
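The "Unskew A" step above is the trick that lets dense kernels run on band storage: in the lower case, LAPACK stores entry A(i,j) of the band at Ab(1+i-j, j), and viewing the same buffer with leading dimension ldAb - 1 places A(i,j) at the ordinary dense position (i,j); the upper case is identical after offsetting the base pointer by kd rows. A small self-check of that index identity in complex-element units (hypothetical helper; i, j are 1-based as in the Fortran convention):

#include <assert.h>
#include <stddef.h>

// Verify: band offset of Ab(1+i-j, j) == dense offset of A(i,j) with ld = ldAb-1.
static void check_unskew(int n, int kd, int ldAb) {
    for (int j = 1; j <= n; ++j)
        for (int i = j; i <= j + kd && i <= n; ++i) {
            size_t band  = (size_t)(j - 1) * ldAb + (i - j);
            size_t dense = (size_t)(j - 1) * (ldAb - 1) + (i - 1);
            assert(band == dense);
        }
}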

92
relapack/src/cpotrf.c Normal file
View File

@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_cpotrf_rec(const char *, const int *, float *,
const int *, int *);
/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's cpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html
* */
void RELAPACK_cpotrf(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** cpotrf's recursive compute kernel */
static void RELAPACK_cpotrf_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_CPOTRF, 1)) {
// Unblocked
LAPACK(cpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
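The two branches restate the block Cholesky equations: for uplo = 'L', A = L L^H splits into A_BL = L_BL L_TL^H (the ctrsm) and A_BR - L_BL L_BL^H = L_BR L_BR^H (the cherk plus the second recursion), and info is shifted by n1 so it keeps naming the order of the offending leading minor. A minimal calling sketch, assuming the interleaved float-pair layout used throughout these files (the helper name is hypothetical):

void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);

// Cholesky-factor a Hermitian positive definite matrix in place.
// A holds 2*n*n floats: column-major complex, interleaved re/im.
static int chol_lower(int n, float *A) {
    int info;
    RELAPACK_cpotrf("L", &n, A, &n, &info);
    return info;  // 0 on success; k > 0 if the leading minor of order k is not positive definite
}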

238
relapack/src/csytrf.c Normal file
View File

@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's csytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html
* */
void RELAPACK_csytrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** csytrf's recursive compute kernel */
static void RELAPACK_csytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
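Two bookkeeping details above are easy to miss. First, a recursive call may retire fewer columns than requested (n1_out < n1 or n2_out < n2) when a 2x2 pivot would straddle the panel boundary; the cgemv then rolls back the one column of trailing updates that was applied prematurely. Second, pivots computed for the trailing block index rows of A_BR and must be shifted to global row numbers while preserving their sign, which encodes the 1x1/2x2 block structure. A standalone equivalent of that shift loop (hypothetical helper name):

// Offset block-local pivot indices by the block's starting row,
// preserving the sign that distinguishes 1x1 from 2x2 pivot blocks.
static void shift_pivots(int *ipiv, int n, int offset) {
    int i;
    for (i = 0; i < n; i++)
        ipiv[i] += (ipiv[i] > 0) ? offset : -offset;
}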

451
relapack/src/csytrf_rec2.c Normal file
View File

@ -0,0 +1,451 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's clasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2, r__3, r__4;
complex q__1, q__2, q__3;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k;
static complex t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern /* Subroutine */ int cscal_(int *, complex *, complex *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen), ccopy_(int *, complex *, int *,
complex *, int *), cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float absakk;
extern int icamax_(int *, complex *, int *);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
w_dim1]), dabs(r__2));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (kw - 1) * w_dim1]), dabs(r__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha *
rowmax) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = k - 1;
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + kw * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
c_div(&q__1, &t, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
w_dim1]), dabs(r__2));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (k + 1) * w_dim1]), dabs(r__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha *
rowmax) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = *n - k;
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k + k * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
c_div(&q__1, &t, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
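The d11/d21/d22/t arithmetic in the kstep == 2 branches above is an inline solve against the 2x2 pivot block, arranged so the determinant is never formed explicitly. Writing w11 and w22 for the block's diagonal workspace entries and d for its off-diagonal one:

D = \begin{pmatrix} w_{11} & d \\ d & w_{22} \end{pmatrix}, \qquad
D^{-1} = \frac{1}{w_{11} w_{22} - d^2}
         \begin{pmatrix} w_{22} & -d \\ -d & w_{11} \end{pmatrix}

and with d_{11} = w_{22}/d, d_{22} = w_{11}/d and t = 1/(d_{11} d_{22} - 1), the solution of D x = b is

x_1 = \frac{t}{d} (d_{11} b_1 - b_2), \qquad
x_2 = \frac{t}{d} (d_{22} b_2 - b_1),

which is exactly what the j-loops compute column by column; the Hermitian variants conjugate d where required.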

236
relapack/src/csytrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's csytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html
* */
void RELAPACK_csytrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** csytrf_rook's recursive compute kernel */
static void RELAPACK_csytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
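Because the routine above is functionally equivalent to LAPACK's csytrf_rook, its output feeds directly into the matching LAPACK solver. A sketch, assuming an f2c-style binding for LAPACK's csytrs_rook; the prototype and helper below are illustrative assumptions, not part of this commit:

extern void csytrs_rook_(const char *uplo, const int *n, const int *nrhs,
                         const float *A, const int *ldA, const int *ipiv,
                         float *B, const int *ldB, int *info);

// Solve A * X = B using the factorization and pivots produced by
// RELAPACK_csytrf_rook (A and ipiv are used exactly as csytrf_rook leaves them).
static int solve_after_factor(int n, int nrhs, const float *A,
                              const int *ipiv, float *B) {
    int info;
    csytrs_rook_("L", &n, &nrhs, A, &n, ipiv, B, &n, &info);
    return info;
}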

565
relapack/src/csytrf_rook_rec2.c Normal file
View File

@ -0,0 +1,565 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's clasyf_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
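As in the other _rec2 kernels, the uplo = 'U' loop below stages the working panel in the trailing columns of the workspace: column k of A maps to column kw = nb + k - n of W (and the lower pivot column of a 2x2 block to kkw = nb + kk - n). A tiny self-check of that mapping under the stated loop range (hypothetical helper):

#include <assert.h>

// For the processed columns k = n-nb+1 .. n, kw stays within 1 .. nb,
// so the nb panel columns fill W's columns 1 .. nb, with A's column n
// landing in W's last column.
static void check_panel_mapping(int n, int nb) {
    int k;
    for (k = n - nb + 1; k <= n; ++k)
        assert(1 <= nb + k - n && nb + k - n <= nb);
}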
/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, complex *a, int *lda, int *ipiv,
complex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k, p;
static complex t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern /* Subroutine */ int cscal_(int *, complex *, complex *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen);
static float sfmin;
extern /* Subroutine */ int ccopy_(int *, complex *, int *,
complex *, int *);
static int itemp;
extern /* Subroutine */ int cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float stemp, absakk;
extern int icamax_(int *, complex *, int *);
extern double slamch_(char *, ftnlen);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
w_dim1]), dabs(r__2));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha *
rowmax)) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = k - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ k * a_dim1]), dabs(r__2)) >= sfmin) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = k - 1;
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L14: */
}
}
}
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d12.r = w[i__1].r, d12.i = w[i__1].i;
c_div(&q__1, &w[k + kw * w_dim1], &d12);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d12);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d12);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
w_dim1]), dabs(r__2));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (k + 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (k + 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha *
rowmax)) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - k - 1;
ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ k * a_dim1]), dabs(r__2)) >= sfmin) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = *n - k;
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L74: */
}
}
}
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k + k * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
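
The constant alpha = (1 + sqrt(17))/8 computed near the top of this kernel is the classical Bunch-Kaufman threshold: a 1-by-1 diagonal pivot is accepted whenever |a_kk| >= alpha * colmax, and otherwise the rook search (the L12/L72 loops above) looks for a better candidate. A minimal sketch of just that acceptance test, on made-up magnitudes:

#include <math.h>
#include <stdio.h>

int main(void) {
    const float alpha = (sqrtf(17.f) + 1.f) / 8.f;  /* ~0.6404, bounds element growth */
    const float absakk = 0.9f;   /* hypothetical |a_kk| */
    const float colmax = 1.0f;   /* hypothetical largest off-diagonal magnitude in column k */
    if (!(absakk < alpha * colmax))
        printf("1x1 pivot accepted\n");   /* taken here: 0.9 >= 0.6404 */
    else
        printf("enter rook pivot search\n");
    return 0;
}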

268
relapack/src/ctgsyl.c Normal file
View File

@ -0,0 +1,268 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *,
const int *, const float *, const int *, const float *, const int *,
float *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, float *, float *, int *);
/** CTGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's ctgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
* */
void RELAPACK_ctgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "C");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'C';
// Constant
const float ZERO[] = { 0., 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
float scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
float dscale = 0;
float dsum = 1;
RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** ctgsyl's recursive compute kernel */
static void RELAPACK_ctgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dsum, float *dscale,
int *info
) {
if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
// Unblocked
LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1., 0. };
float scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = CREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + 2 * *ldA * m1;
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + 2 * m1;
// D_TL D_TR
// 0 D_BR
const float *const D_TL = D;
const float *const D_TR = D + 2 * *ldD * m1;
const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1;
// F_T
// F_B
float *const F_T = F;
float *const F_B = F + 2 * m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
// C_T = C_T - A_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + 2 * *ldC * n1;
// E_TL E_TR
// 0 E_BR
const float *const E_TL = E;
const float *const E_TR = E + 2 * *ldE * n1;
const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1;
// F_L F_R
float *const F_L = F;
float *const F_R = F + 2 * *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
// C_R = C_R + F_L * B_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
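
The lWork == -1 branch above implements the usual LAPACK workspace-query convention: the routine stores the minimal workspace size in Work[0] and computes nothing else. A sketch of the resulting two-call pattern, assuming this translation unit is linked against the library (the matrix contents are irrelevant to the query itself):

#include <stdio.h>
#include <stdlib.h>

void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *,
    const float *, const int *, const float *, const int *,
    float *, const int *,
    const float *, const int *, const float *, const int *,
    float *, const int *,
    float *, float *,
    float *, const int *, int *, int *);

int main(void) {
    const int m = 2, n = 2, ijob = 1, ld = 2;
    float A[8] = {0}, B[8] = {0}, C[8] = {0}, D[8] = {0}, E[8] = {0}, F[8] = {0};
    float scale, dif, wkopt[2];
    int iWork[8], info, lWork = -1;      /* -1 requests a workspace query */
    RELAPACK_ctgsyl("N", &ijob, &m, &n, A, &ld, B, &ld, C, &ld,
                    D, &ld, E, &ld, F, &ld, &scale, &dif,
                    wkopt, &lWork, iWork, &info);
    const int lwmin = (int)wkopt[0];     /* here MAX(1, 2*m*n) = 8 complex entries */
    float *Work = malloc(2 * lwmin * sizeof(float));  /* 2 floats per complex entry */
    printf("info = %d, required workspace = %d\n", info, lwmin);
    /* a second call would pass Work and an lWork >= lwmin to actually solve */
    free(Work);
    return 0;
}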

163
relapack/src/ctrsyl.c Normal file
View File

@ -0,0 +1,163 @@
#include "relapack.h"
static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *,
const int *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, int *);
/** CTRSYL solves the complex Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's ctrsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html
* */
void RELAPACK_ctrsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!ctransA && !notransA)
*info = -1;
else if (!ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : 'C';
const char cleantranB = notransB ? 'N' : 'C';
// Recursive kernel
RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** ctrsyl's recursive compute kernel */
static void RELAPACK_ctrsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) {
// Unblocked
RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const float MSGN[] = { -*isgn, 0. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1., 0. };
float scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = CREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + 2 * *ldA * m1;
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + 2 * m1;
if (*tranA == 'N') {
// recursion(A_BR, B, C_B)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recursion(A_TL, B, C_T)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recursion(A_TL, B, C_T)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recursion(A_BR, B, C_B)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
}
} else {
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + 2 * *ldC * n1;
if (*tranB == 'N') {
// recursion(A, B_TL, C_L)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recursion(A, B_BR, C_R)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recursion(A, B_BR, C_R)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recursion(A, B_TL, C_L)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
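
At the bottom of this recursion the Sylvester equation op(A)*X + isgn*X*op(B) = scale*C shrinks to 1-by-1 operands, where it reduces to the scalar equation (a + isgn*b)*x = c; the unblocked kernel solves exactly this, with scaling guards against overflow. An illustrative real-valued instance of the base case:

#include <stdio.h>

int main(void) {
    const double a = 2., b = 3., c = 10.;   /* 1x1 "matrices" */
    const int isgn = 1;
    const double x = c / (a + isgn * b);    /* well-conditioned case: no scaling needed */
    printf("x = %g, residual = %g\n", x, a * x + isgn * x * b - c);  /* x = 2, residual 0 */
    return 0;
}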

392
relapack/src/ctrsyl_rec2.c Normal file
View File

@ -0,0 +1,392 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "../config.h"
#include "f2c.h"
#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
extern void cdotu_(complex *, int *, complex *, int *, complex *, int *);
complex result;
cdotu_(&result, n, x, incx, y, incy);
return result;
}
#define cdotu_ cdotu_fun
complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
extern void cdotc_(complex *, int *, complex *, int *, complex *, int *);
complex result;
cdotc_(&result, n, x, incx, y, incy);
return result;
}
#define cdotc_ cdotc_fun
#endif
#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
complex cladiv_fun(complex *a, complex *b) {
extern void cladiv_(complex *, complex *, complex *);
complex result;
cladiv_(&result, a, b);
return result;
}
#define cladiv_ cladiv_fun
#endif
/* Table of constant values */
static int c__1 = 1;
/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
*
* This routine is an exact copy of LAPACK's ctrsyl.
* It serves as an unblocked kernel in the recursive algorithms.
* */
/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int
*isgn, int *m, int *n, complex *a, int *lda, complex *b,
int *ldb, complex *c__, int *ldc, float *scale, int *info,
ftnlen trana_len, ftnlen tranb_len)
{
/* System generated locals */
int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
float r_imag(complex *);
void r_cnjg(complex *, complex *);
/* Local variables */
static int j, k, l;
static complex a11;
static float db;
static complex x11;
static float da11;
static complex vec;
static float dum[1], eps, sgn, smin;
static complex suml, sumr;
/* Complex */ complex cdotc_(int *, complex *, int
*, complex *, int *);
extern int lsame_(char *, char *, ftnlen, ftnlen);
/* Complex */ complex cdotu_(int *, complex *, int
*, complex *, int *);
extern /* Subroutine */ int slabad_(float *, float *);
extern float clange_(char *, int *, int *, complex *,
int *, float *, ftnlen);
/* Complex */ complex cladiv_(complex *, complex *);
static float scaloc;
extern float slamch_(char *, ftnlen);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*), xerbla_(char *, int *, ftnlen);
static float bignum;
static int notrna, notrnb;
static float smlnum;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
b_dim1 = *ldb;
b_offset = 1 + b_dim1;
b -= b_offset;
c_dim1 = *ldc;
c_offset = 1 + c_dim1;
c__ -= c_offset;
/* Function Body */
notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
*info = 0;
if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
*info = -1;
} else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
*info = -2;
} else if (*isgn != 1 && *isgn != -1) {
*info = -3;
} else if (*m < 0) {
*info = -4;
} else if (*n < 0) {
*info = -5;
} else if (*lda < max(1,*m)) {
*info = -7;
} else if (*ldb < max(1,*n)) {
*info = -9;
} else if (*ldc < max(1,*m)) {
*info = -11;
}
if (*info != 0) {
i__1 = -(*info);
xerbla_("CTRSY2", &i__1, (ftnlen)6);
return;
}
*scale = 1.f;
if (*m == 0 || *n == 0) {
return;
}
eps = slamch_("P", (ftnlen)1);
smlnum = slamch_("S", (ftnlen)1);
bignum = 1.f / smlnum;
slabad_(&smlnum, &bignum);
smlnum = smlnum * (float) (*m * *n) / eps;
bignum = 1.f / smlnum;
/* Computing MAX */
r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, (
ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n,
&b[b_offset], ldb, dum, (ftnlen)1);
smin = dmax(r__1,r__2);
sgn = (float) (*isgn);
if (notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
for (k = *m; k >= 1; --k) {
i__2 = *m - k;
/* Computing MIN */
i__3 = k + 1;
/* Computing MIN */
i__4 = k + 1;
q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
min(i__4,*m) + l * c_dim1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__2 = l - 1;
q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = q__1.r, sumr.i = q__1.i;
i__2 = k + l * c_dim1;
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i;
q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L10: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L20: */
}
/* L30: */
}
} else if (! notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
i__2 = *m;
for (k = 1; k <= i__2; ++k) {
i__3 = k - 1;
q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__3 = l - 1;
q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = q__1.r, sumr.i = q__1.i;
i__3 = k + l * c_dim1;
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
r_cnjg(&q__2, &a[k + k * a_dim1]);
i__3 = l + l * b_dim1;
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L40: */
}
*scale *= scaloc;
}
i__3 = k + l * c_dim1;
c__[i__3].r = x11.r, c__[i__3].i = x11.i;
/* L50: */
}
/* L60: */
}
} else if (! notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
i__1 = *m;
for (k = 1; k <= i__1; ++k) {
i__2 = k - 1;
q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__2 = *n - l;
/* Computing MIN */
i__3 = l + 1;
/* Computing MIN */
i__4 = l + 1;
q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
l + min(i__4,*n) * b_dim1], ldb);
sumr.r = q__1.r, sumr.i = q__1.i;
i__2 = k + l * c_dim1;
r_cnjg(&q__4, &sumr);
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i;
r_cnjg(&q__1, &q__2);
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L70: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L80: */
}
/* L90: */
}
} else if (notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
for (k = *m; k >= 1; --k) {
i__1 = *m - k;
/* Computing MIN */
i__2 = k + 1;
/* Computing MIN */
i__3 = k + 1;
q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
min(i__3,*m) + l * c_dim1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__1 = *n - l;
/* Computing MIN */
i__2 = l + 1;
/* Computing MIN */
i__3 = l + 1;
q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
l + min(i__3,*n) * b_dim1], ldb);
sumr.r = q__1.r, sumr.i = q__1.i;
i__1 = k + l * c_dim1;
r_cnjg(&q__4, &sumr);
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__1 = k + k * a_dim1;
r_cnjg(&q__3, &b[l + l * b_dim1]);
q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i;
q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L100: */
}
*scale *= scaloc;
}
i__1 = k + l * c_dim1;
c__[i__1].r = x11.r, c__[i__1].i = x11.i;
/* L110: */
}
/* L120: */
}
}
return;
}
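
The cdotu_/cdotc_ wrappers at the top of this file bridge two calling conventions for complex-valued BLAS functions: some ABIs return the complex result by value, others write it through a hidden first argument. For reference, this sketch shows what the unconjugated dot product cdotu computes, using a hand-rolled complex type rather than the library's:

#include <stdio.h>

typedef struct { float r, i; } complex_f;   /* stand-in for f2c's complex */

static complex_f cdotu_ref(int n, const complex_f *x, int incx,
                           const complex_f *y, int incy) {
    complex_f s = { 0.f, 0.f };
    int k;
    for (k = 0; k < n; k++) {
        const complex_f a = x[k * incx], b = y[k * incy];
        s.r += a.r * b.r - a.i * b.i;        /* Re(a*b), no conjugation */
        s.i += a.r * b.i + a.i * b.r;        /* Im(a*b) */
    }
    return s;
}

int main(void) {
    complex_f x[2] = { { 1.f, 1.f }, { 2.f, 0.f } };
    complex_f y[2] = { { 3.f, 0.f }, { 0.f, 1.f } };
    const complex_f d = cdotu_ref(2, x, 1, y, 1);
    printf("(%g, %g)\n", d.r, d.i);          /* (1+i)*3 + 2*i = (3, 5) */
    return 0;
}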

107
relapack/src/ctrtri.c Normal file
View File

@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_ctrtri_rec(const char *, const char *, const int *,
float *, const int *, int *);
/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's ctrtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html
* */
void RELAPACK_ctrtri(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
*info = i + 1;
return;
}
}
// Recursive kernel
RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** ctrtri's recursive compute kernel */
static void RELAPACK_ctrtri_rec(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_CTRTRI, 1)) {
// Unblocked
LAPACK(ctrti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
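
All the complex kernels in this diff address matrices the same way: a complex m-by-n matrix is stored column-major as interleaved floats, so element (i,j) lives at A[2*(i + ldA*j)] (real part) and A[2*(i + ldA*j) + 1] (imaginary part), which is why block pointers above look like A + 2 * *ldA * n1 + 2 * n1. A minimal demonstration of the convention:

#include <stdio.h>

int main(void) {
    const int ldA = 2;                   /* leading dimension (>= number of rows) */
    float A[2 * 2 * 2];                  /* 2-by-2 complex matrix, interleaved */
    int i, j;
    for (j = 0; j < 2; j++)
        for (i = 0; i < 2; i++) {
            A[2 * (i + ldA * j)]     = (float)(10 * i + j);  /* real part */
            A[2 * (i + ldA * j) + 1] = 0.5f;                 /* imaginary part */
        }
    printf("A(1,0) = %g + %gi\n",        /* prints: A(1,0) = 10 + 0.5i */
           A[2 * (1 + ldA * 0)], A[2 * (1 + ldA * 0) + 1]);
    return 0;
}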

227
relapack/src/dgbtrf.c Normal file
View File

@ -0,0 +1,227 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *,
const int *, double *, const int *, int *, double *, const int *, double *,
const int *, int *);
/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's dgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html
* */
void RELAPACK_dgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DGBTRF", &minfo);
return;
}
// Constant
const double ZERO[] = { 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + kv;
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
double *const A_j = A + *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[i] = 0.;
}
// Allocate work space
const int n1 = DREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
double *Workl = malloc(mWorkl * nWorkl * sizeof(double));
double *Worku = malloc(mWorku * nWorku * sizeof(double));
LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** dgbtrf's recursive compute kernel */
static void RELAPACK_dgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_DGBTRF, 1)) {
// Unblocked
LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + kv;
// Splitting
const int n1 = MIN(DREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
double *const Ab_L = Ab;
double *const Ab_BR = Ab + *ldAb * n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + m1;
double *const A_BR = A + *ldA * n1 + m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
double *const A_Rl = A_R;
double *const A_Rr = A_R + *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + m21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + *ldA * n21;
double *const A_BRbl = A_BR + m21;
double *const A_BRbr = A_BR + *ldA * n21 + m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
double *const A_Rrj = A_Rr + *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const double tmp = A_Rrj[i];
A_Rrj[i] = A_Rrj[ip];
A_Rrj[ip] = tmp;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}
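
The "unskew" step above is an index identity on LAPACK band storage: entry (i,j) of the band matrix sits at Ab[kv + i - j + ldAb*j] (0-based, kv = kl + ku), so with A = Ab + kv and ldA = ldAb - 1 the very same entry is A[i + ldA*j], and the band can be addressed like a dense matrix. A small check of the identity for one in-band entry:

#include <stdio.h>

int main(void) {
    const int kl = 1, ku = 1;            /* band widths */
    const int kv = kl + ku;              /* result upper band width, as above */
    const int ldAb = 2 * kl + ku + 1;    /* band storage leading dimension */
    const int i = 2, j = 1;              /* an in-band entry: |i - j| <= kl */
    const int banded   = (kv + i - j) + ldAb * j;   /* Ab index of A(i,j) */
    const int unskewed = kv + i + (ldAb - 1) * j;   /* (Ab + kv)[i + (ldAb-1)*j] */
    printf("banded index %d == unskewed index %d\n", banded, unskewed);  /* 7 == 7 */
    return 0;
}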

165
relapack/src/dgemmt.c Normal file
View File

@ -0,0 +1,165 @@
#include "relapack.h"
static void RELAPACK_dgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
/** DGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_dgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !notransA)
info = 2;
else if (!tranB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("DGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : 'T';
const char cleantransB = notransB ? 'N' : 'T';
// Recursive kernel
RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** dgemmt's recursive compute kernel */
static void RELAPACK_dgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_DGEMMT, 1)) {
// Unblocked
RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const double *const A_T = A;
const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const double *const B_L = B;
const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
double *const C_TL = C;
double *const C_TR = C + *ldC * n1;
double *const C_BL = C + n1;
double *const C_BR = C + *ldC * n1 + n1;
// recursion(C_TL)
RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** dgemmt's unblocked compute kernel */
static void RELAPACK_dgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const double *const A_0 = A;
const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
double *const C_0i = C + *ldC * i;
double *const C_ii = C + *ldC * i + i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}
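
For reference, this is a naive (not optimized) version of the operation dgemmt performs with uplo = 'L' and transA = transB = 'N': only C(i,j) with i >= j is updated, and the strict upper triangle is left untouched:

#include <stdio.h>

static void gemmt_lower_ref(int n, int k, double alpha, const double *A, int ldA,
                            const double *B, int ldB, double beta, double *C, int ldC) {
    int i, j, l;
    for (j = 0; j < n; j++)
        for (i = j; i < n; i++) {            /* lower triangle only */
            double s = 0.;
            for (l = 0; l < k; l++)
                s += A[i + ldA * l] * B[l + ldB * j];
            C[i + ldC * j] = alpha * s + beta * C[i + ldC * j];
        }
}

int main(void) {
    double A[4] = { 1., 2., 3., 4. };        /* [1 3; 2 4], column-major */
    double B[4] = { 1., 0., 0., 1. };        /* identity */
    double C[4] = { 0., 0., 99., 0. };       /* C(0,1) = 99 must survive */
    gemmt_lower_ref(2, 2, 1., A, 2, B, 2, 0., C, 2);
    printf("C = [%g %g; %g %g]\n", C[0], C[2], C[1], C[3]);  /* [1 99; 2 4] */
    return 0;
}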

117
relapack/src/dgetrf.c Normal file
View File

@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
const int *, int *, int *);
/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's dgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
* */
void RELAPACK_dgetrf(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const double ONE[] = { 1. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const double *const A_L = A;
double *const A_R = A + *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** dgetrf's recursive compute kernel */
static void RELAPACK_dgetrf_rec(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
// Unblocked
LAPACK(dgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}
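
dgetf2, the base case of this recursion, is the classical unblocked LU with partial pivoting. The following is a minimal self-contained reference version (not the library routine) for column-major storage with LAPACK-style 1-based pivot indices:

#include <math.h>
#include <stdio.h>

static int lu_unblocked(int n, double *A, int ldA, int *ipiv) {
    int i, j, k, p;
    for (j = 0; j < n; j++) {
        p = j;                               /* find the largest pivot in column j */
        for (i = j + 1; i < n; i++)
            if (fabs(A[i + ldA * j]) > fabs(A[p + ldA * j])) p = i;
        ipiv[j] = p + 1;                     /* 1-based, as in LAPACK */
        if (A[p + ldA * j] == 0.0) return j + 1;   /* singular: info > 0 */
        for (k = 0; k < n; k++) {            /* swap rows j and p */
            const double t = A[j + ldA * k];
            A[j + ldA * k] = A[p + ldA * k];
            A[p + ldA * k] = t;
        }
        for (i = j + 1; i < n; i++) {        /* eliminate below the pivot */
            A[i + ldA * j] /= A[j + ldA * j];
            for (k = j + 1; k < n; k++)
                A[i + ldA * k] -= A[i + ldA * j] * A[j + ldA * k];
        }
    }
    return 0;
}

int main(void) {
    double A[4] = { 1., 3., 2., 4. };        /* [1 2; 3 4], column-major */
    int ipiv[2];
    lu_unblocked(2, A, 2, ipiv);
    printf("ipiv = {%d, %d}, L21 = %g, U = [%g %g; 0 %g]\n",
           ipiv[0], ipiv[1], A[1], A[0], A[2], A[3]);
    return 0;
}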

87
relapack/src/dlauum.c Normal file
View File

@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_dlauum_rec(const char *, const int *, double *,
const int *, int *);
/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's dlauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html
* */
void RELAPACK_dlauum(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info);
}
/** dlauum's recursive compute kernel */
static void RELAPACK_dlauum_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_DLAUUM, 1)) {
// Unblocked
LAPACK(dlauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info);
}
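
A worked instance of what dlauum computes with uplo = 'L': the lower triangle of A is overwritten with the lower triangle of L' * L. The naive triple loop below (a reference sketch, not the blocked algorithm) produces the same entries:

#include <stdio.h>

int main(void) {
    const double L[4] = { 2., 1., 0., 3. };  /* L = [2 0; 1 3], column-major */
    double R[4] = { 0., 0., 0., 0. };
    int i, j, k;
    for (i = 0; i < 2; i++)
        for (j = 0; j <= i; j++)             /* lower triangle only */
            for (k = 0; k < 2; k++)
                R[i + 2 * j] += L[k + 2 * i] * L[k + 2 * j];  /* (L'*L)(i,j) */
    printf("lower(L'*L) = [%g .; %g %g]\n", R[0], R[1], R[3]);  /* [5 .; 3 9] */
    return 0;
}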

157
relapack/src/dpbtrf.c Normal file
View File

@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *,
double *, const int *, double *, const int *, int *);
/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's dpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html
* */
void RELAPACK_dpbtrf(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const double ZERO[] = { 0. };
// Allocate work space
const int n1 = DREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
double *Work = malloc(mWork * nWork * sizeof(double));
LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** dpbtrf's recursive compute kernel */
static void RELAPACK_dpbtrf_rec(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
double *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
// Unblocked
LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(DREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
double *const Ab_BR = Ab + *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, n1);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + n21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + *ldA * n21;
double *const A_BRbl = A_BR + n21;
double *const A_BRbr = A_BR + *ldA * n21 + n21;
if (*uplo == 'L') {
// A_BLt = A_BLt / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/dpotrf.c Normal file
View File

@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_dpotrf_rec(const char *, const int *, double *,
const int *, int *);
/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's dpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
* */
void RELAPACK_dpotrf(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** dpotrf's recursive compute kernel */
static void RELAPACK_dpotrf_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_DPOTRF, 1)) {
// Unblocked
LAPACK(dpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
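
For a 2-by-2 matrix with n1 = n2 = 1, the lower-triangular recursion above reduces to the textbook Cholesky formulas: l11 = sqrt(a11) from recursion(A_TL), l21 = a21/l11 from the dtrsm step, and l22 = sqrt(a22 - l21^2) from the dsyrk downdate plus recursion(A_BR). A worked instance:

#include <math.h>
#include <stdio.h>

int main(void) {
    const double a11 = 4., a21 = 2., a22 = 5.;   /* A = [4 2; 2 5], SPD */
    const double l11 = sqrt(a11);                /* recursion(A_TL) */
    const double l21 = a21 / l11;                /* A_BL = A_BL / A_TL' (dtrsm) */
    const double l22 = sqrt(a22 - l21 * l21);    /* dsyrk downdate + recursion(A_BR) */
    printf("L = [%g 0; %g %g]\n", l11, l21, l22);  /* [2 0; 1 2] */
    return 0;
}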

Some files were not shown because too many files have changed in this diff.