From 96dd0ef4f71da18324864e979133e873aa66306a Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 14:25:10 +1000 Subject: [PATCH 01/42] Optimise ccopy for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/ccopy_microk_power8.c | 128 ++++++++++++++--------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c index b2b1bead1..613c4d286 100644 --- a/kernel/power/ccopy_microk_power8.c +++ b/kernel/power/ccopy_microk_power8.c @@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %2, %2, 128 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 
\n\t" "addi %2, %2, 128 \n\t" @@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : From be55f96cbdc919c7c7da2da2f7a2c6c47336a9f6 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 14:58:00 +1000 Subject: [PATCH 02/42] Optimise scopy for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/scopy_microk_power8.c | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 444a6d4d5..7a54d5e1e 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" @@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, 
%9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : From 6f4eca5ea4ab00726199277bb7a079900d20d388 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:23:20 +1000 Subject: [PATCH 03/42] Optimise sswap for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/sswap_microk_power8.c | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index d44f16765..cfefdd6ef 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" - "lxvw4x 40, 0, %3 \n\t" - "lxvw4x 41, %5, %3 \n\t" - "lxvw4x 42, %6, %3 \n\t" - "lxvw4x 43, %7, %3 \n\t" - "lxvw4x 44, %8, %3 \n\t" - "lxvw4x 45, %9, %3 \n\t" - "lxvw4x 46, %10, %3 \n\t" - "lxvw4x 47, %11, %3 \n\t" + "lxvd2x 40, 0, %3 \n\t" + "lxvd2x 41, %5, %3 \n\t" + "lxvd2x 42, %6, %3 \n\t" + "lxvd2x 43, %7, %3 \n\t" + "lxvd2x 44, %8, %3 \n\t" + "lxvd2x 45, %9, %3 \n\t" + "lxvd2x 46, %10, %3 \n\t" + "lxvd2x 47, %11, %3 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %4 \n\t" - "stxvw4x 41, %5, %4 \n\t" - "stxvw4x 42, %6, %4 \n\t" - "stxvw4x 43, %7, %4 \n\t" - "stxvw4x 44, %8, %4 \n\t" - "stxvw4x 45, %9, %4 \n\t" - "stxvw4x 46, %10, %4 \n\t" - "stxvw4x 47, %11, %4 \n\t" + "stxvd2x 40, 0, %4 \n\t" + "stxvd2x 41, %5, %4 \n\t" + "stxvd2x 42, %6, %4 \n\t" + "stxvd2x 43, %7, %4 \n\t" + "stxvd2x 44, %8, %4 \n\t" + "stxvd2x 45, %9, %4 \n\t" + "stxvd2x 46, %10, %4 \n\t" + "stxvd2x 47, %11, %4 \n\t" "addi %4, %4, 128 \n\t" From 4f09030fdc36444709cf3af9041a8043f1f6d83d Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:36:10 +1000 Subject: [PATCH 04/42] Optimise cswap for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
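[The same one-line rationale recurs in each of these patches, so it is worth spelling out why the substitution is safe: on little-endian POWER, lxvd2x may place the loaded elements in a different lane order within the vector register than lxvw4x would, but every kernel touched in this series either stores the data back with the matching stxvd2x — which applies the inverse permutation, preserving memory order — or performs only lane-order-independent elementwise work. A minimal sketch of the round-trip argument, assuming a powerpc64le target with VSX; copy4_vsx is an illustrative name, not part of the patches:]

    /* Sketch only: pairing lxvd2x with stxvd2x round-trips 4 floats
     * through vs32; whatever lane permutation the load introduces is
     * undone by the matching store, so memory order is preserved. */
    static void copy4_vsx (const float *x, float *y)
    {
      __asm__ ("lxvd2x  32, 0, %0          \n\t"   // load 16 bytes into vs32
               "stxvd2x 32, 0, %1          \n"     // store them back unchanged
               :
               : "r" (x), "r" (y)
               : "vs32", "memory");
    }
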
--- kernel/power/cswap_microk_power8.c | 128 ++++++++++++++--------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c index 1dd03dc88..8d7d0c0b9 100644 --- a/kernel/power/cswap_microk_power8.c +++ b/kernel/power/cswap_microk_power8.c @@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "lxvw4x 40, 0, %4 \n\t" - "lxvw4x 41, %5, %4 \n\t" - "lxvw4x 42, %6, %4 \n\t" - "lxvw4x 43, %7, %4 \n\t" - "lxvw4x 44, %8, %4 \n\t" - "lxvw4x 45, %9, %4 \n\t" - "lxvw4x 46, %10, %4 \n\t" - "lxvw4x 47, %11, %4 \n\t" + "lxvd2x 40, 0, %4 \n\t" + "lxvd2x 41, %5, %4 \n\t" + "lxvd2x 42, %6, %4 \n\t" + "lxvd2x 43, %7, %4 \n\t" + "lxvd2x 44, %8, %4 \n\t" + "lxvd2x 45, %9, %4 \n\t" + "lxvd2x 46, %10, %4 \n\t" + "lxvd2x 47, %11, %4 \n\t" "addi %4, %4, -128 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 49, %5, %3 \n\t" - "lxvw4x 50, %6, %3 \n\t" - "lxvw4x 51, %7, %3 \n\t" - "lxvw4x 0, %8, %3 \n\t" - "lxvw4x 1, %9, %3 \n\t" - "lxvw4x 2, %10, %3 \n\t" - "lxvw4x 3, %11, %3 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "lxvd2x 0, %8, %3 \n\t" + "lxvd2x 1, %9, %3 \n\t" + "lxvd2x 2, %10, %3 \n\t" + "lxvd2x 3, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "lxvw4x 4, 0, %3 \n\t" - "lxvw4x 5, %5, %3 \n\t" - "lxvw4x 6, %6, %3 \n\t" - "lxvw4x 7, %7, %3 \n\t" - "lxvw4x 8, %8, %3 \n\t" - "lxvw4x 9, %9, %3 \n\t" - "lxvw4x 10, %10, %3 \n\t" - "lxvw4x 11, %11, %3 \n\t" + "lxvd2x 4, 0, %3 \n\t" + "lxvd2x 5, %5, %3 \n\t" + "lxvd2x 6, %6, %3 \n\t" + "lxvd2x 7, %7, %3 \n\t" + "lxvd2x 8, %8, %3 \n\t" + "lxvd2x 9, %9, %3 \n\t" + "lxvd2x 10, %10, %3 \n\t" + "lxvd2x 11, %11, %3 \n\t" "addi %3, %3, -128 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 48, 0, %4 \n\t" - "stxvw4x 49, %5, %4 \n\t" - "stxvw4x 50, %6, %4 \n\t" - "stxvw4x 51, %7, %4 \n\t" - "stxvw4x 0, %8, %4 \n\t" - "stxvw4x 1, %9, %4 \n\t" - "stxvw4x 2, %10, %4 \n\t" - "stxvw4x 3, %11, %4 \n\t" + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 0, %8, %4 \n\t" + 
"stxvd2x 1, %9, %4 \n\t" + "stxvd2x 2, %10, %4 \n\t" + "stxvd2x 3, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "stxvw4x 4, 0, %4 \n\t" - "stxvw4x 5, %5, %4 \n\t" - "stxvw4x 6, %6, %4 \n\t" - "stxvw4x 7, %7, %4 \n\t" - "stxvw4x 8, %8, %4 \n\t" - "stxvw4x 9, %9, %4 \n\t" - "stxvw4x 10, %10, %4 \n\t" - "stxvw4x 11, %11, %4 \n\t" + "stxvd2x 4, 0, %4 \n\t" + "stxvd2x 5, %5, %4 \n\t" + "stxvd2x 6, %6, %4 \n\t" + "stxvd2x 7, %7, %4 \n\t" + "stxvd2x 8, %8, %4 \n\t" + "stxvd2x 9, %9, %4 \n\t" + "stxvd2x 10, %10, %4 \n\t" + "stxvd2x 11, %11, %4 \n\t" "addi %4, %4, 128 \n\t" From 19bdf9d52b222a4edd3e1710023af8c40f84c255 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:38:32 +1000 Subject: [PATCH 05/42] Optimise casum for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/casum_microk_power8.c | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c index 93ba50660..7d12c9885 100644 --- a/kernel/power/casum_microk_power8.c +++ b/kernel/power/casum_microk_power8.c @@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" From 32c7fe6bff6f04d61e6a09d10199a14e63e77083 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:39:27 +1000 Subject: [PATCH 06/42] Optimise sasum for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
--- kernel/power/sasum_microk_power8.c | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index 08a766f80..4bb515de8 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" From e0034de22d9a789988e29e3b67a796cee0c97965 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:43:31 +1000 Subject: [PATCH 07/42] Optimise sdot for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
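[For sdot the lane-order argument needs one extra observation: x and y are loaded with identical lxvd2x permutations, so each xvmaddasp product still pairs x[i] with y[i] regardless of lane order. Scalar reference as a sketch; sdot_ref is an illustrative name:]

    /* Scalar reference (sketch): both inputs see the same permutation,
     * so every product pairs matching elements; the final sum is again
     * order-independent up to rounding. */
    static float sdot_ref (long n, const float *x, const float *y)
    {
        float dot = 0.0f;
        for (long i = 0; i < n; i++)
            dot += x[i] * y[i];
        return dot;
    }
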
--- kernel/power/sdot_microk_power8.c | 64 +++++++++++++++---------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index 7f7ccfac3..bfe100c8b 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" @@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y) "1: \n\t" "xvmaddasp 32, 40, 48 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" "xvmaddasp 33, 41, 49 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" "xvmaddasp 34, 42, 50 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" "xvmaddasp 35, 43, 51 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" "xvmaddasp 36, 44, %x4 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" "xvmaddasp 37, 45, %x5 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" "xvmaddasp 38, 46, %x6 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" "xvmaddasp 39, 47, %x7 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" From edc97918f8e45e6e922d0e221cf103a4c736ca61 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:45:58 +1000 Subject: [PATCH 08/42] Optimise srot for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
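[The asm comments in the hunks below spell out the arithmetic; as a scalar reference (a sketch, srot_ref is an illustrative name), the kernel applies a plane rotation, and because c and s are splatted identically to all lanes, the operation is once more lane-order independent:]

    /* Scalar reference (sketch) for the plane rotation applied below:
     * x'[i] = c*x[i] + s*y[i],  y'[i] = c*y[i] - s*x[i]. */
    static void srot_ref (long n, float *x, float *y, float c, float s)
    {
        for (long i = 0; i < n; i++) {
            float xi = c * x[i] + s * y[i];
            float yi = c * y[i] - s * x[i];
            x[i] = xi;
            y[i] = yi;
        }
    }
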
--- kernel/power/srot_microk_power8.c | 64 +++++++++++++++---------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 0a18c16e0..6eecb60a1 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xscvdpspn 37, %x14 \n\t" // load s to all words "xxspltw 37, 37, 0 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "addi %3, %3, 64 \n\t" "addi %4, %4, 64 \n\t" @@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvmulsp 44, 32, 37 \n\t" // s * x "xvmulsp 45, 33, 37 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" "xvmulsp 46, 34, 37 \n\t" "xvmulsp 47, 35, 37 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" "xvmulsp %x9, 48, 37 \n\t" // s * y "xvmulsp %x10, 49, 37 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" "xvmulsp %x11, 50, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y @@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n\t" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n\t" "addi %3, %3, 128 \n\t" "addi %4, %4, 128 \n\t" @@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n" "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" From bd831a03a80d642693c786f7a65265ad40a50fc0 Mon Sep 17 00:00:00 2001 From: Matt 
Brown Date: Wed, 14 Jun 2017 16:47:56 +1000 Subject: [PATCH 09/42] Optimise sscal for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/sscal_microk_power8.c | 80 +++++++++++++++--------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 49862a329..058ff3399 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xscvdpspn %x3, %x3 \n\t" "xxspltw %x3, %x3, 0 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" "xvmulsp 42, 34, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" "xvmulsp 44, 36, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, -128 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n\t" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n\t" "addi %2, %2, 256 \n\t" @@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : @@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x) ".p2align 5 \n" "1: \n\t" - "stxvw4x %x3, 0, %2 \n\t" - "stxvw4x %x3, %4, %2 \n\t" - "stxvw4x %x3, %5, %2 \n\t" - "stxvw4x %x3, %6, %2 \n\t" - "stxvw4x %x3, %7, %2 \n\t" - "stxvw4x %x3, %8, %2 \n\t" - "stxvw4x %x3, %9, %2 \n\t" - "stxvw4x %x3, %10, %2 \n\t" + "stxvd2x %x3, 0, %2 \n\t" + "stxvd2x %x3, %4, %2 \n\t" + "stxvd2x %x3, %5, %2 \n\t" + "stxvd2x %x3, %6, %2 \n\t" + "stxvd2x %x3, %7, %2 \n\t" + "stxvd2x %x3, %8, %2 \n\t" + "stxvd2x %x3, %9, %2 \n\t" + "stxvd2x %x3, 
%10, %2 \n\t" "addi %2, %2, 128 \n\t" From 34513be72654102504f231cc27b33d26eddf88ac Mon Sep 17 00:00:00 2001 From: Neil Shipp Date: Fri, 23 Jun 2017 13:07:34 -0700 Subject: [PATCH 10/42] Add Microsoft Windows 10 UWP build support --- CMakeLists.txt | 9 ++++- cmake/c_check.cmake | 5 +++ cmake/prebuild.cmake | 62 ++++++++++++++++++------------- common.h | 13 ++++++- driver/others/CMakeLists.txt | 2 + driver/others/blas_server_win32.c | 7 +++- utest/CMakeLists.txt | 4 ++ 7 files changed, 70 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c20a57eac..e6ae891b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h ) - ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h) + + ADD_CUSTOM_TARGET(genconfig + ALL + DEPENDS openblas_config.h + ) add_dependencies(genconfig ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) @@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME} message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(genf77blas + ALL COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h ) @@ -255,11 +260,11 @@ if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(gencblas + ALL COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h ) - add_dependencies(gencblas ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 115bdaf4e..56ae612ea 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF} "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${FU}\n") +if (${HOST_OS} STREQUAL "WINDOWSSTORE") + file(APPEND ${TARGET_CONF} + "#define OS_WINNT\t1\n") +endif () + diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6a21c0bcc..a7f98bfb8 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -72,20 +72,26 @@ if (MSVC) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) endif() +if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + # disable WindowsStore strict CRT checks + set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) +endif () + set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) -try_compile(GETARCH_RESULT ${GETARCH_DIR} - SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GETARCH_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GETARCH_RESULT ${GETARCH_DIR} + SOURCES ${GETARCH_SRC} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH_LOG + COPY_FILE 
${PROJECT_BINARY_DIR}/${GETARCH_BIN} + ) -if (NOT ${GETARCH_RESULT}) - MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") + if (NOT ${GETARCH_RESULT}) + MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") + endif () endif () - message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way @@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT}) set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH2_DIR}) -try_compile(GETARCH2_RESULT ${GETARCH2_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GETARCH2_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GETARCH2_RESULT ${GETARCH2_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH2_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} + ) -if (NOT ${GETARCH2_RESULT}) - MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") + if (NOT ${GETARCH2_RESULT}) + MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") + endif () endif () # use the cmake binary w/ the -E param to run a shell command in a cross-platform way @@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}") set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) -try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GEN_CONFIG_H_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GEN_CONFIG_H_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} + ) -if (NOT ${GEN_CONFIG_H_RESULT}) - MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") -endif () + if (NOT ${GEN_CONFIG_H_RESULT}) + MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") + endif () +endif () \ No newline at end of file diff --git a/common.h b/common.h index c9cc2f0f2..4463141c8 100644 --- a/common.h +++ b/common.h @@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #endif #ifndef ASSEMBLER +#ifdef OS_WINDOWSSTORE +typedef char env_var_t[MAX_PATH]; +#define readenv(p, n) 0 +#else #ifdef OS_WINDOWS typedef char env_var_t[MAX_PATH]; #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) @@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH]; typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif +#endif #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #ifdef _POSIX_MONOTONIC_CLOCK @@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){ *address = 0; } - +#ifdef OS_WINDOWSSTORE +static __inline int readenv_atoi(char *env) { + return 0; +} +#else #ifdef OS_WINDOWS static __inline int readenv_atoi(char *env) { env_var_t p; @@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) { 
return(0); } #endif - +#endif #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 489d40c76..8e0be1e0e 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -12,6 +12,8 @@ if (SMP) set(BLAS_SERVER blas_server_omp.c) elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(BLAS_SERVER blas_server_win32.c) + elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") + set(BLAS_SERVER blas_server_win32.c) endif () if (NOT DEFINED BLAS_SERVER) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 081bdd7d4..cde8ca793 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ - WaitForSingleObject(blas_threads[i], 5); //INFINITE); - TerminateThread(blas_threads[i],0); + WaitForSingleObject(blas_threads[i], 5); //INFINITE); +#ifndef OS_WINDOWSSTORE +// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP + TerminateThread(blas_threads[i],0); +#endif } blas_server_avail = 0; diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 9cf518e05..bd31ed9c6 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -21,6 +21,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(${OpenBLAS_utest_bin} m) endif() +if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") +set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS") +endif() + #Set output for utest set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) From 9b7b5f7fdc3f60e33c24f8de3dc708a50f7a33e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Jun 2017 17:38:41 +0200 Subject: [PATCH 11/42] Add Elmar Peise's ReLAPACK --- relapack/LICENSE | 22 + relapack/Makefile | 64 ++ relapack/README.md | 68 ++ relapack/config.h | 208 ++++++ relapack/config.md | 87 +++ relapack/coverage.md | 212 ++++++ relapack/inc/relapack.h | 67 ++ relapack/src/blas.h | 61 ++ relapack/src/cgbtrf.c | 230 ++++++ relapack/src/cgemmt.c | 167 +++++ relapack/src/cgetrf.c | 117 +++ relapack/src/chegst.c | 212 ++++++ relapack/src/chetrf.c | 236 ++++++ relapack/src/chetrf_rec2.c | 520 ++++++++++++++ relapack/src/chetrf_rook.c | 236 ++++++ relapack/src/chetrf_rook_rec2.c | 661 +++++++++++++++++ relapack/src/clauum.c | 87 +++ relapack/src/cpbtrf.c | 157 ++++ relapack/src/cpotrf.c | 92 +++ relapack/src/csytrf.c | 238 ++++++ relapack/src/csytrf_rec2.c | 451 ++++++++++++ relapack/src/csytrf_rook.c | 236 ++++++ relapack/src/csytrf_rook_rec2.c | 565 +++++++++++++++ relapack/src/ctgsyl.c | 268 +++++++ relapack/src/ctrsyl.c | 163 +++++ relapack/src/ctrsyl_rec2.c | 392 ++++++++++ relapack/src/ctrtri.c | 107 +++ relapack/src/dgbtrf.c | 227 ++++++ relapack/src/dgemmt.c | 165 +++++ relapack/src/dgetrf.c | 117 +++ relapack/src/dlauum.c | 87 +++ relapack/src/dpbtrf.c | 157 ++++ relapack/src/dpotrf.c | 92 +++ relapack/src/dsygst.c | 212 ++++++ relapack/src/dsytrf.c | 238 ++++++ relapack/src/dsytrf_rec2.c | 352 +++++++++ relapack/src/dsytrf_rook.c | 236 ++++++ relapack/src/dsytrf_rook_rec2.c | 451 ++++++++++++ relapack/src/dtgsyl.c | 274 +++++++ relapack/src/dtrsyl.c | 169 +++++ relapack/src/dtrsyl_rec2.c | 1034 +++++++++++++++++++++++++++ relapack/src/dtrtri.c | 107 +++ relapack/src/f2c.c | 109 
+++ relapack/src/f2c.h | 223 ++++++ relapack/src/lapack.h | 80 +++ relapack/src/lapack_wrappers.c | 607 ++++++++++++++++ relapack/src/lapack_wrappers.c.orig | 607 ++++++++++++++++ relapack/src/relapack.h | 60 ++ relapack/src/sgbtrf.c | 227 ++++++ relapack/src/sgemmt.c | 165 +++++ relapack/src/sgetrf.c | 117 +++ relapack/src/slauum.c | 87 +++ relapack/src/spbtrf.c | 157 ++++ relapack/src/spotrf.c | 92 +++ relapack/src/ssygst.c | 212 ++++++ relapack/src/ssytrf.c | 238 ++++++ relapack/src/ssytrf_rec2.c | 351 +++++++++ relapack/src/ssytrf_rook.c | 236 ++++++ relapack/src/ssytrf_rook_rec2.c | 451 ++++++++++++ relapack/src/stgsyl.c | 274 +++++++ relapack/src/strsyl.c | 169 +++++ relapack/src/strsyl_rec2.c | 1029 ++++++++++++++++++++++++++ relapack/src/strtri.c | 107 +++ relapack/src/zgbtrf.c | 230 ++++++ relapack/src/zgemmt.c | 167 +++++ relapack/src/zgetrf.c | 117 +++ relapack/src/zhegst.c | 212 ++++++ relapack/src/zhetrf.c | 236 ++++++ relapack/src/zhetrf_rec2.c | 524 ++++++++++++++ relapack/src/zhetrf_rook.c | 236 ++++++ relapack/src/zhetrf_rook_rec2.c | 662 +++++++++++++++++ relapack/src/zlauum.c | 87 +++ relapack/src/zpbtrf.c | 157 ++++ relapack/src/zpotrf.c | 92 +++ relapack/src/zsytrf.c | 238 ++++++ relapack/src/zsytrf_rec2.c | 452 ++++++++++++ relapack/src/zsytrf_rook.c | 236 ++++++ relapack/src/zsytrf_rook_rec2.c | 561 +++++++++++++++ relapack/src/ztgsyl.c | 268 +++++++ relapack/src/ztrsyl.c | 163 +++++ relapack/src/ztrsyl_rec2.c | 394 ++++++++++ relapack/src/ztrtri.c | 107 +++ 82 files changed, 20579 insertions(+) create mode 100644 relapack/LICENSE create mode 100644 relapack/Makefile create mode 100644 relapack/README.md create mode 100644 relapack/config.h create mode 100644 relapack/config.md create mode 100644 relapack/coverage.md create mode 100644 relapack/inc/relapack.h create mode 100644 relapack/src/blas.h create mode 100644 relapack/src/cgbtrf.c create mode 100644 relapack/src/cgemmt.c create mode 100644 relapack/src/cgetrf.c create mode 100644 relapack/src/chegst.c create mode 100644 relapack/src/chetrf.c create mode 100644 relapack/src/chetrf_rec2.c create mode 100644 relapack/src/chetrf_rook.c create mode 100644 relapack/src/chetrf_rook_rec2.c create mode 100644 relapack/src/clauum.c create mode 100644 relapack/src/cpbtrf.c create mode 100644 relapack/src/cpotrf.c create mode 100644 relapack/src/csytrf.c create mode 100644 relapack/src/csytrf_rec2.c create mode 100644 relapack/src/csytrf_rook.c create mode 100644 relapack/src/csytrf_rook_rec2.c create mode 100644 relapack/src/ctgsyl.c create mode 100644 relapack/src/ctrsyl.c create mode 100644 relapack/src/ctrsyl_rec2.c create mode 100644 relapack/src/ctrtri.c create mode 100644 relapack/src/dgbtrf.c create mode 100644 relapack/src/dgemmt.c create mode 100644 relapack/src/dgetrf.c create mode 100644 relapack/src/dlauum.c create mode 100644 relapack/src/dpbtrf.c create mode 100644 relapack/src/dpotrf.c create mode 100644 relapack/src/dsygst.c create mode 100644 relapack/src/dsytrf.c create mode 100644 relapack/src/dsytrf_rec2.c create mode 100644 relapack/src/dsytrf_rook.c create mode 100644 relapack/src/dsytrf_rook_rec2.c create mode 100644 relapack/src/dtgsyl.c create mode 100644 relapack/src/dtrsyl.c create mode 100644 relapack/src/dtrsyl_rec2.c create mode 100644 relapack/src/dtrtri.c create mode 100644 relapack/src/f2c.c create mode 100644 relapack/src/f2c.h create mode 100644 relapack/src/lapack.h create mode 100644 relapack/src/lapack_wrappers.c create mode 100644 relapack/src/lapack_wrappers.c.orig create mode 
100644 relapack/src/relapack.h create mode 100644 relapack/src/sgbtrf.c create mode 100644 relapack/src/sgemmt.c create mode 100644 relapack/src/sgetrf.c create mode 100644 relapack/src/slauum.c create mode 100644 relapack/src/spbtrf.c create mode 100644 relapack/src/spotrf.c create mode 100644 relapack/src/ssygst.c create mode 100644 relapack/src/ssytrf.c create mode 100644 relapack/src/ssytrf_rec2.c create mode 100644 relapack/src/ssytrf_rook.c create mode 100644 relapack/src/ssytrf_rook_rec2.c create mode 100644 relapack/src/stgsyl.c create mode 100644 relapack/src/strsyl.c create mode 100644 relapack/src/strsyl_rec2.c create mode 100644 relapack/src/strtri.c create mode 100644 relapack/src/zgbtrf.c create mode 100644 relapack/src/zgemmt.c create mode 100644 relapack/src/zgetrf.c create mode 100644 relapack/src/zhegst.c create mode 100644 relapack/src/zhetrf.c create mode 100644 relapack/src/zhetrf_rec2.c create mode 100644 relapack/src/zhetrf_rook.c create mode 100644 relapack/src/zhetrf_rook_rec2.c create mode 100644 relapack/src/zlauum.c create mode 100644 relapack/src/zpbtrf.c create mode 100644 relapack/src/zpotrf.c create mode 100644 relapack/src/zsytrf.c create mode 100644 relapack/src/zsytrf_rec2.c create mode 100644 relapack/src/zsytrf_rook.c create mode 100644 relapack/src/zsytrf_rook_rec2.c create mode 100644 relapack/src/ztgsyl.c create mode 100644 relapack/src/ztrsyl.c create mode 100644 relapack/src/ztrsyl_rec2.c create mode 100644 relapack/src/ztrtri.c diff --git a/relapack/LICENSE b/relapack/LICENSE new file mode 100644 index 000000000..edeb4046e --- /dev/null +++ b/relapack/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2016 Elmar Peise + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/relapack/Makefile b/relapack/Makefile new file mode 100644 index 000000000..1e81b5423 --- /dev/null +++ b/relapack/Makefile @@ -0,0 +1,64 @@ +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + + + +SRC = $(wildcard src/*.c) +OBJS = $(SRC:%.c=%.o) + +TEST_SUITS = \ + slauum dlauum clauum zlauum \ + spotrf dpotrf cpotrf zpotrf \ + spbtrf dpbtrf cpbtrf zpbtrf \ + ssygst dsygst chegst zhegst \ + ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ + sgetrf dgetrf cgetrf zgetrf \ + sgbtrf dgbtrf cgbtrf zgbtrf \ + strsyl dtrsyl ctrsyl ztrsyl \ + stgsyl dtgsyl ctgsyl ztgsyl \ + sgemmt dgemmt cgemmt zgemmt +TESTS = $(TEST_SUITS:%=test/%.pass) # dummies +TEST_EXES = $(TEST_SUITS:%=test/%.x) + +LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm + +.SECONDARY: $(TEST_EXES) +.PHONY: test + +# ReLAPACK compilation + +libs: $(OBJS) + @echo "Building ReLAPACK library $(LIBNAME)" + $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) + $(RANLIB) $(TOPDIR)/$(LIBNAME) + +%.o: %.c config.h + $(CC) $(CFLAGS) -c $< -o $@ + + +# ReLAPACK testing + +test: $(TEST_EXES) $(TESTS) + @echo "passed all tests" + +test/%.pass: test/%.x + @echo -n $*: + @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) + +test/s%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/d%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/c%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/z%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + + +# cleaning up + +clean: + rm -f $(OBJS) test/util.o test/*.x diff --git a/relapack/README.md b/relapack/README.md new file mode 100644 index 000000000..1947c1748 --- /dev/null +++ b/relapack/README.md @@ -0,0 +1,68 @@ +ReLAPACK +======== + +[![Build Status](https://travis-ci.org/HPAC/ReLAPACK.svg?branch=master)](https://travis-ci.org/HPAC/ReLAPACK) + +[Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK) + +ReLAPACK offers a collection of recursive algorithms for many of LAPACK's +compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK +integrates effortlessly into existing application codes. ReLAPACK's routines +not only outperform the reference LAPACK but also improve upon the performance +of tuned implementations, such as OpenBLAS and MKL. + + +Coverage +-------- +For a detailed list of covered operations and an overview of operations to which +recursion is not efficiently applicable, see [coverage.md](coverage.md). + + +Installation +------------ +To compile with the default configuration, simply run `make` to create the +library `librelapack.a`. + +### Linking with MKL +Note that to link with MKL, you currently need to set the flag +`COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and +`ztrsyl`. For further configuration options see [config.md](config.md). + + +### Dependencies +ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked +kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized +and machine specific implementations of these libraries, which are commonly +provided by hardware vendors or available as open source (e.g., +[OpenBLAS](http://www.openblas.net/)). 
+ + +Testing +------- +ReLAPACK's test suite compares its routines numerically with LAPACK's +counterparts. To set up the tests (located int `test/`) you need to specify +link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then +`make test` runs the tests. For details on the performed tests, see +[test/README.md](test/README.md). + + +Examples +-------- +Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the +covered routines applies directly to ReLAPACK. A few separate examples are +given in `examples/`. For details, see [examples/README.md](examples/README.md). + + +Citing +------ +When referencing ReLAPACK, please cite the preprint of the paper +[Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763): + + @article{relapack, + author = {Elmar Peise and Paolo Bientinesi}, + title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection}, + journal = {CoRR}, + volume = {abs/1602.06763}, + year = {2016}, + url = {http://arxiv.org/abs/1602.06763}, + } diff --git a/relapack/config.h b/relapack/config.h new file mode 100644 index 000000000..9113a712d --- /dev/null +++ b/relapack/config.h @@ -0,0 +1,208 @@ +#ifndef RELAPACK_CONFIG_H +#define RELAPACK_CONFIG_H + +// ReLAPACK configuration file. +// See also config.md + + +/////////////////////////////// +// BLAS/LAPACK obect symbols // +/////////////////////////////// + +// BLAS routines linked against have a trailing underscore +#define BLAS_UNDERSCORE 1 +// LAPACK routines linked against have a trailing underscore +#define LAPACK_UNDERSCORE BLAS_UNDERSCORE + +// Complex BLAS/LAPACK routines return their result in the first argument +// This option must be enabled when linking to MKL for ctrsyl and ztrsyl to +// work. +#define COMPLEX_FUNCTIONS_AS_ROUTINES 0 +#ifdef F_INTERFACE_INTEL +#define COMPLEX_FUNCTIONS_AS_ROUTINES 1 +#endif +#define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES +#define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES + +// The BLAS-like extension xgemmt is provided by an external library. 
+#define HAVE_XGEMMT 0 + + +//////////////////////////// +// Use malloc in ReLAPACK // +//////////////////////////// + +#define ALLOW_MALLOC 1 +// allow malloc in xsygst for improved performance +#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC +// allow malloc in xsytrf if the passed work buffer is too small +#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC + + +//////////////////////////////// +// LAPACK routine replacement // +//////////////////////////////// +// The following macros specify which routines are included in the library under +// LAPACK's symbol names: 1 included, 0 not included + +#define INCLUDE_ALL 1 + +#define INCLUDE_XLAUUM INCLUDE_ALL +#define INCLUDE_SLAUUM INCLUDE_XLAUUM +#define INCLUDE_DLAUUM INCLUDE_XLAUUM +#define INCLUDE_CLAUUM INCLUDE_XLAUUM +#define INCLUDE_ZLAUUM INCLUDE_XLAUUM + +#define INCLUDE_XSYGST INCLUDE_ALL +#define INCLUDE_SSYGST INCLUDE_XSYGST +#define INCLUDE_DSYGST INCLUDE_XSYGST +#define INCLUDE_CHEGST INCLUDE_XSYGST +#define INCLUDE_ZHEGST INCLUDE_XSYGST + +#define INCLUDE_XTRTRI INCLUDE_ALL +#define INCLUDE_STRTRI INCLUDE_XTRTRI +#define INCLUDE_DTRTRI INCLUDE_XTRTRI +#define INCLUDE_CTRTRI INCLUDE_XTRTRI +#define INCLUDE_ZTRTRI INCLUDE_XTRTRI + +#define INCLUDE_XPOTRF INCLUDE_ALL +#define INCLUDE_SPOTRF INCLUDE_XPOTRF +#define INCLUDE_DPOTRF INCLUDE_XPOTRF +#define INCLUDE_CPOTRF INCLUDE_XPOTRF +#define INCLUDE_ZPOTRF INCLUDE_XPOTRF + +#define INCLUDE_XPBTRF INCLUDE_ALL +#define INCLUDE_SPBTRF INCLUDE_XPBTRF +#define INCLUDE_DPBTRF INCLUDE_XPBTRF +#define INCLUDE_CPBTRF INCLUDE_XPBTRF +#define INCLUDE_ZPBTRF INCLUDE_XPBTRF + +#define INCLUDE_XSYTRF INCLUDE_ALL +#define INCLUDE_SSYTRF INCLUDE_XSYTRF +#define INCLUDE_DSYTRF INCLUDE_XSYTRF +#define INCLUDE_CSYTRF INCLUDE_XSYTRF +#define INCLUDE_CHETRF INCLUDE_XSYTRF +#define INCLUDE_ZSYTRF INCLUDE_XSYTRF +#define INCLUDE_ZHETRF INCLUDE_XSYTRF +#define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF +#define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF +#define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF +#define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF +#define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF +#define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF + +#define INCLUDE_XGETRF INCLUDE_ALL +#define INCLUDE_SGETRF INCLUDE_XGETRF +#define INCLUDE_DGETRF INCLUDE_XGETRF +#define INCLUDE_CGETRF INCLUDE_XGETRF +#define INCLUDE_ZGETRF INCLUDE_XGETRF + +#define INCLUDE_XGBTRF INCLUDE_ALL +#define INCLUDE_SGBTRF INCLUDE_XGBTRF +#define INCLUDE_DGBTRF INCLUDE_XGBTRF +#define INCLUDE_CGBTRF INCLUDE_XGBTRF +#define INCLUDE_ZGBTRF INCLUDE_XGBTRF + +#define INCLUDE_XTRSYL INCLUDE_ALL +#define INCLUDE_STRSYL INCLUDE_XTRSYL +#define INCLUDE_DTRSYL INCLUDE_XTRSYL +#define INCLUDE_CTRSYL INCLUDE_XTRSYL +#define INCLUDE_ZTRSYL INCLUDE_XTRSYL + +#define INCLUDE_XTGSYL INCLUDE_ALL +#define INCLUDE_STGSYL INCLUDE_XTGSYL +#define INCLUDE_DTGSYL INCLUDE_XTGSYL +#define INCLUDE_CTGSYL INCLUDE_XTGSYL +#define INCLUDE_ZTGSYL INCLUDE_XTGSYL + +#define INCLUDE_XGEMMT 0 +#define INCLUDE_SGEMMT INCLUDE_XGEMMT +#define INCLUDE_DGEMMT INCLUDE_XGEMMT +#define INCLUDE_CGEMMT INCLUDE_XGEMMT +#define INCLUDE_ZGEMMT INCLUDE_XGEMMT + + +///////////////////// +// crossover sizes // +///////////////////// + +// default crossover size +#define CROSSOVER 24 + +// individual crossover sizes +#define CROSSOVER_XLAUUM CROSSOVER +#define CROSSOVER_SLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_DLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_CLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM + +#define CROSSOVER_XSYGST CROSSOVER +#define CROSSOVER_SSYGST CROSSOVER_XSYGST +#define CROSSOVER_DSYGST 
CROSSOVER_XSYGST +#define CROSSOVER_CHEGST CROSSOVER_XSYGST +#define CROSSOVER_ZHEGST CROSSOVER_XSYGST + +#define CROSSOVER_XTRTRI CROSSOVER +#define CROSSOVER_STRTRI CROSSOVER_XTRTRI +#define CROSSOVER_DTRTRI CROSSOVER_XTRTRI +#define CROSSOVER_CTRTRI CROSSOVER_XTRTRI +#define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI + +#define CROSSOVER_XPOTRF CROSSOVER +#define CROSSOVER_SPOTRF CROSSOVER_XPOTRF +#define CROSSOVER_DPOTRF CROSSOVER_XPOTRF +#define CROSSOVER_CPOTRF CROSSOVER_XPOTRF +#define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF + +#define CROSSOVER_XPBTRF CROSSOVER +#define CROSSOVER_SPBTRF CROSSOVER_XPBTRF +#define CROSSOVER_DPBTRF CROSSOVER_XPBTRF +#define CROSSOVER_CPBTRF CROSSOVER_XPBTRF +#define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF + +#define CROSSOVER_XSYTRF CROSSOVER +#define CROSSOVER_SSYTRF CROSSOVER_XSYTRF +#define CROSSOVER_DSYTRF CROSSOVER_XSYTRF +#define CROSSOVER_CSYTRF CROSSOVER_XSYTRF +#define CROSSOVER_CHETRF CROSSOVER_XSYTRF +#define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF +#define CROSSOVER_ZHETRF CROSSOVER_XSYTRF +#define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF +#define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF +#define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF +#define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF +#define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF +#define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF + +#define CROSSOVER_XGETRF CROSSOVER +#define CROSSOVER_SGETRF CROSSOVER_XGETRF +#define CROSSOVER_DGETRF CROSSOVER_XGETRF +#define CROSSOVER_CGETRF CROSSOVER_XGETRF +#define CROSSOVER_ZGETRF CROSSOVER_XGETRF + +#define CROSSOVER_XGBTRF CROSSOVER +#define CROSSOVER_SGBTRF CROSSOVER_XGBTRF +#define CROSSOVER_DGBTRF CROSSOVER_XGBTRF +#define CROSSOVER_CGBTRF CROSSOVER_XGBTRF +#define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF + +#define CROSSOVER_XTRSYL CROSSOVER +#define CROSSOVER_STRSYL CROSSOVER_XTRSYL +#define CROSSOVER_DTRSYL CROSSOVER_XTRSYL +#define CROSSOVER_CTRSYL CROSSOVER_XTRSYL +#define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL + +#define CROSSOVER_XTGSYL CROSSOVER +#define CROSSOVER_STGSYL CROSSOVER_XTGSYL +#define CROSSOVER_DTGSYL CROSSOVER_XTGSYL +#define CROSSOVER_CTGSYL CROSSOVER_XTGSYL +#define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL + +// sytrf helper routine +#define CROSSOVER_XGEMMT CROSSOVER_XSYTRF +#define CROSSOVER_SGEMMT CROSSOVER_XGEMMT +#define CROSSOVER_DGEMMT CROSSOVER_XGEMMT +#define CROSSOVER_CGEMMT CROSSOVER_XGEMMT +#define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT + +#endif /* RELAPACK_CONFIG_H */ diff --git a/relapack/config.md b/relapack/config.md new file mode 100644 index 000000000..ea14be16a --- /dev/null +++ b/relapack/config.md @@ -0,0 +1,87 @@ +RELAPACK Configuration +====================== + +ReLAPACK has two configuration files: `make.inc`, which is included by the +Makefile, and `config.h` which is included in the source files. + + +Build and Testing Environment +----------------------------- +The build environment (compiler and flags) and the test configuration (linker +flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size +and error bounds are defined in `test/config.h`. + +The library `librelapack.a` is compiled by invoking `make`. The tests are +performed by either `make test` or calling `make` in the test folder. 
+
+
+BLAS/LAPACK complex function interfaces
+---------------------------------------
+For BLAS and LAPACK functions that return a complex number, there exist two
+conflicting (FORTRAN compiler dependent) calling conventions: either the
+result is returned as a `struct` of two floating point numbers, or a pointer
+to such a `struct` is passed as an additional first argument. By default
+ReLAPACK uses the former (which is what gfortran uses), but it can switch to
+the latter by setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS
+and LAPACK specific counterparts) to `1` in `config.h`.
+
+**For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.**
+
+(Using the wrong convention will break `ctrsyl` and `ztrsyl`, and the test
+cases will segfault or return errors on the order of 1 or larger.)
+
+
+BLAS extension `xgemmt`
+-----------------------
+The LDL decompositions require `xgemmt`, a general matrix-matrix product that
+updates only a triangular part of the result matrix. If the BLAS
+implementation linked against provides such a routine, set the flag
+`HAVE_XGEMMT` to `1` in `config.h`; otherwise, ReLAPACK uses its own recursive
+implementation of these kernels.
+
+`xgemmt` is provided by MKL.
+
+
+Routine Selection
+-----------------
+ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the
+corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set
+to `1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g.,
+`dgetrf_`). By default, wrappers for all routines are enabled.
+
+
+Crossover Size
+--------------
+The crossover size determines below which matrix sizes ReLAPACK's recursive
+algorithms switch to LAPACK's unblocked routines, thereby avoiding calls to
+BLAS Level 3 routines on tiny matrices. The crossover size is set in
+`config.h` and can be chosen either globally for the entire library, by
+operation, or individually by routine.
+
+
+Allowing Temporary Buffers
+--------------------------
+Two of ReLAPACK's routines make use of temporary buffers, which are allocated
+and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine
+specific counterparts) to `0` in `config.h` disables these buffers. The
+affected routines are:
+
+ * `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in
+   LAPACK, this size can be queried by setting `lWork = -1`; the passed buffer
+   is used if it is large enough, and a local buffer is allocated only if it
+   is not.
+
+   The advantage of this mechanism is that ReLAPACK works seamlessly even with
+   codes that statically provide too little memory, instead of breaking them.
+
+ * `xsygst`: The reduction of a real symmetric-definite generalized
+   eigenproblem to standard form can use an auxiliary buffer of size n^2 / 2
+   to avoid redundant computations. It thereby performs about 30% fewer FLOPs
+   than LAPACK.
+
+
+FORTRAN symbol names
+--------------------
+ReLAPACK is commonly linked to BLAS and LAPACK libraries with standard FORTRAN
+interfaces. Since these libraries usually append an underscore to their symbol
+names, ReLAPACK has configuration switches in `config.h` to adjust the
+corresponding routine names.
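+
+A hypothetical sketch of such a switch (the `LAPACK(...)` wrapper macro below
+mirrors the one used throughout ReLAPACK's sources; the exact name of the
+on/off switch in `config.h` may differ):
+
+```c
+/* Append the trailing underscore expected by gfortran-compiled libraries. */
+#define LAPACK_UNDERSCORE 1
+
+#if LAPACK_UNDERSCORE
+#define LAPACK(routine) routine##_
+#else
+#define LAPACK(routine) routine
+#endif
+
+/* LAPACK(dgetf2) now expands to dgetf2_ (or dgetf2, respectively). */
+```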
diff --git a/relapack/coverage.md b/relapack/coverage.md
new file mode 100644
index 000000000..8406b2078
--- /dev/null
+++ b/relapack/coverage.md
@@ -0,0 +1,212 @@
+Coverage of ReLAPACK
+====================
+
+This file lists all LAPACK compute routines that are covered by recursive
+algorithms in ReLAPACK. It also lists all of LAPACK's blocked algorithms which
+are not (yet) part of ReLAPACK.
+
+
+**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
+
+- [List of covered LAPACK routines](#list-of-covered-lapack-routines)
+  - [`xlauum`](#xlauum)
+  - [`xsygst`](#xsygst)
+  - [`xtrtri`](#xtrtri)
+  - [`xpotrf`](#xpotrf)
+  - [`xpbtrf`](#xpbtrf)
+  - [`xsytrf`](#xsytrf)
+  - [`xgetrf`](#xgetrf)
+  - [`xgbtrf`](#xgbtrf)
+  - [`xtrsyl`](#xtrsyl)
+  - [`xtgsyl`](#xtgsyl)
+- [Covered BLAS extension](#covered-blas-extension)
+  - [`xgemmt`](#xgemmt)
+- [Not covered yet](#not-covered-yet)
+  - [`xpstrf`](#xpstrf)
+- [Not covered: extra FLOPs](#not-covered-extra-flops)
+  - [QR decomposition (and related)](#qr-decomposition-and-related)
+  - [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal)
+  - [Reduction to bidiagonal](#reduction-to-bidiagonal)
+  - [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg)
+
+
+List of covered LAPACK routines
+-------------------------------
+
+### `xlauum`
+Multiplication of a triangular matrix with its (complex conjugate) transpose,
+resulting in a symmetric (Hermitian) matrix.
+
+Routines: `slauum`, `dlauum`, `clauum`, `zlauum`
+
+Operations:
+* A = L^T L
+* A = U U^T
+
+### `xsygst`
+Simultaneous two-sided multiplication of a symmetric matrix with a triangular
+matrix and its transpose
+
+Routines: `ssygst`, `dsygst`, `chegst`, `zhegst`
+
+Operations:
+* A = inv(L) A inv(L^T)
+* A = inv(U^T) A inv(U)
+* A = L^T A L
+* A = U A U^T
+
+### `xtrtri`
+Inversion of a triangular matrix
+
+Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri`
+
+Operations:
+* L = inv(L)
+* U = inv(U)
+
+### `xpotrf`
+Cholesky decomposition of a symmetric (Hermitian) positive definite matrix
+
+Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf`
+
+Operations:
+* L L^T = A
+* U^T U = A
+
+### `xpbtrf`
+Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix
+
+Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf`
+
+Operations:
+* L L^T = A
+* U^T U = A
+
+### `xsytrf`
+LDL decomposition of a symmetric (or Hermitian) matrix
+
+Routines:
+* `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`
+* `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`,
+  `zhetrf_rook`
+
+Operations:
+* L D L^T = A
+* U^T D U = A
+
+### `xgetrf`
+LU decomposition of a general matrix with pivoting
+
+Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf`
+
+Operation: P L U = A
+
+### `xgbtrf`
+LU decomposition of a general banded matrix with pivoting
+
+Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf`
+
+Operation: L U = A
+
+### `xtrsyl`
+Solution of the quasi-triangular Sylvester equation
+
+Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl`
+
+Operations:
+* A X + X B = C -> X
+* A^T X + X B = C -> X
+* A X + X B^T = C -> X
+* A^T X + X B^T = C -> X
+* A X - X B = C -> X
+* A^T X - X B = C -> X
+* A X - X B^T = C -> X
+* A^T X - X B^T = C -> X
+
+### `xtgsyl`
+Solution of the generalized Sylvester equations
+
+Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl`
+
+Operations:
+* A R - L B = C, D R - L E = F -> L, R
+* A^T R + D^T L = C, R B^T + L E^T = -F -> L, R
+
+
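+As an illustration (not part of the original list): the covered routines are
+callable directly under their `RELAPACK_` names. The prototype below is the
+one declared in `inc/relapack.h`; a 1-by-1 Sylvester system makes the
+semantics easy to check by hand.
+
+```c
+#include <stdio.h>
+
+void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *,
+                     const int *, const double *, const int *, const double *,
+                     const int *, double *, const int *, double *, int *);
+
+int main(void) {
+    /* Solve A X + X B = scale * C with A = [2], B = [3], C = [10]: X = 2. */
+    const int isgn = 1, m = 1, n = 1, ld = 1;
+    const double A = 2., B = 3.;
+    double C = 10., scale;   /* the solution X overwrites C */
+    int info;
+    RELAPACK_dtrsyl("N", "N", &isgn, &m, &n, &A, &ld, &B, &ld, &C, &ld, &scale, &info);
+    printf("X = %g, scale = %g, info = %d\n", C / scale, scale, info);
+    return 0;
+}
+```
+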
+Covered BLAS extension
+----------------------
+
+### `xgemmt`
+Matrix-matrix product updating only a triangular part of the result
+
+Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt`
+
+Operations:
+* C = alpha A B + beta C
+* C = alpha A B^T + beta C
+* C = alpha A^T B + beta C
+* C = alpha A^T B^T + beta C
+
+
+Not covered yet
+---------------
+The following operation is implemented as a blocked algorithm in LAPACK but is
+not yet covered in ReLAPACK as a recursive algorithm.
+
+### `xpstrf`
+Cholesky decomposition of a positive semi-definite matrix with complete
+pivoting.
+
+Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf`
+
+Operations:
+* P L L^T P^T = A
+* P U^T U P^T = A
+
+
+Not covered: extra FLOPs
+------------------------
+The following routines are not covered because recursive variants would
+require considerably more FLOPs or operate on banded matrices.
+
+### QR decomposition (and related)
+Routines:
+* `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf`
+* `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf`
+* `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf`
+* `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf`
+* `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf`
+
+Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A
+
+Routines for multiplication with Q:
+* `sormqr`, `dormqr`, `cunmqr`, `zunmqr`
+* `sormrq`, `dormrq`, `cunmrq`, `zunmrq`
+* `sormql`, `dormql`, `cunmql`, `zunmql`
+* `sormlq`, `dormlq`, `cunmlq`, `zunmlq`
+* `sormrz`, `dormrz`, `cunmrz`, `zunmrz`
+
+Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T
+
+Routines for construction of Q:
+* `sorgqr`, `dorgqr`, `cungqr`, `zungqr`
+* `sorgrq`, `dorgrq`, `cungrq`, `zungrq`
+* `sorgql`, `dorgql`, `cungql`, `zungql`
+* `sorglq`, `dorglq`, `cunglq`, `zunglq`
+
+### Symmetric reduction to tridiagonal
+Routines: `ssytrd`, `dsytrd`, `chetrd`, `zhetrd`
+
+Operation: Q T Q^T = A
+
+### Reduction to bidiagonal
+Routines: `sgebrd`, `dgebrd`, `cgebrd`, `zgebrd`
+
+Operation: Q B P^T = A
+
+### Reduction to upper Hessenberg
+Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd`
+
+Operation: Q H Q^T = A
diff --git a/relapack/inc/relapack.h b/relapack/inc/relapack.h
new file mode 100644
index 000000000..e421f352b
--- /dev/null
+++ b/relapack/inc/relapack.h
@@ -0,0 +1,67 @@
+#ifndef RELAPACK_H
+#define RELAPACK_H
+
+void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
+void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
+void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
+void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
+void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
+void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
+void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
+void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
+void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
+void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
+void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
+void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
+void
RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); + +void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); + +void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); + +void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); + +void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); + +void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); + +void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int 
*, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); + +void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +#endif /* RELAPACK_H */ diff --git a/relapack/src/blas.h b/relapack/src/blas.h new file mode 100644 index 000000000..7441c1033 --- /dev/null +++ b/relapack/src/blas.h @@ -0,0 +1,61 @@ +#ifndef BLAS_H +#define BLAS_H + +extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); +extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); +extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); +extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); + +extern void BLAS(sscal)(const int *, const float *, float *, const int *); +extern void BLAS(dscal)(const int *, const double *, double *, const int *); +extern void BLAS(cscal)(const int *, const float *, float *, const int *); +extern void BLAS(zscal)(const int *, const double *, double *, const int *); + +extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void 
BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); + +extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); + +extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); +extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); +extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); +extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); + +extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, 
const double *, double *, const int *); +extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +#if HAVE_XGEMMT +extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +#endif + +#endif /* BLAS_H */ diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c new file mode 100644 index 000000000..90b2c8789 --- /dev/null +++ b/relapack/src/cgbtrf.c @@ -0,0 +1,230 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, + const int *, float *, const int *, int *, float *, const int *, float *, + const int *, int *); + + +/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's cgbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html + * */ +void RELAPACK_cgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CGBTRF", &minfo); + return; + } + + // Constant + const float ZERO[] = { 0., 0. 
}; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + float *const A_j = A + 2 * *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[2 * i] = A_j[2 * i + 1] = 0.; + } + + // Allocate work space + const int n1 = CREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); + float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); + LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** cgbtrf's recursive compute kernel */ +static void RELAPACK_cgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { + // Unblocked + LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * kv; + + // Splitting + const int n1 = MIN(CREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + float *const Ab_L = Ab; + float *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * m1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + float *const A_Rl = A_R; + float *const A_Rr = A_R + 2 * *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + 2 * *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + 2 * m21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + 2 * *ldA * n21; + float *const A_BRbl = A_BR + 2 * m21; + float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + 
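+    // (Added note: the loop above re-applies the interchanges from the left
+    //  factorization, with Workl standing in for the rows that fall outside
+    //  the band of A_L, so the following updates can use the panel uniformly;
+    //  the matching loop below undoes these swaps once the trailing panels
+    //  have been updated.)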
+
+    // apply pivots to A_Rl
+    LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
+
+    // apply pivots to A_Rr columnwise
+    for (j = 0; j < n22; j++) {
+        float *const A_Rrj = A_Rr + 2 * *ldA * j;
+        for (i = j; i < mn1; i++) {
+            const int ip = ipiv_T[i] - 1;
+            if (ip != i) {
+                const float tmpr = A_Rrj[2 * i];
+                const float tmpc = A_Rrj[2 * i + 1];
+                A_Rrj[2 * i]      = A_Rrj[2 * ip];
+                A_Rrj[2 * i + 1]  = A_Rrj[2 * ip + 1];
+                A_Rrj[2 * ip]     = tmpr;
+                A_Rrj[2 * ip + 1] = tmpc;
+            }
+        }
+    }
+
+    // A_TRl = A_TL \ A_TRl
+    BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+    // Worku = A_TRr
+    LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
+    // Worku = A_TL \ Worku
+    BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
+    // A_TRr = Worku
+    LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
+    // A_BRtl = A_BRtl - A_BLt * A_TRl
+    BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
+    // A_BRbl = A_BRbl - Workl * A_TRl
+    BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
+    // A_BRtr = A_BRtr - A_BLt * Worku
+    BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
+    // A_BRbr = A_BRbr - Workl * Worku
+    BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
+
+    // partially undo swaps in A_L
+    for (i = mn1 - 1; i >= 0; i--) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
+            else
+                BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
+        }
+    }
+
+    // recursion(Ab_BR, ipiv_B)
+    RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    if (*info)
+        *info += n1;
+    // shift pivots
+    for (i = 0; i < mn2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/cgemmt.c b/relapack/src/cgemmt.c
new file mode 100644
index 000000000..28e2b00b0
--- /dev/null
+++ b/relapack/src/cgemmt.c
@@ -0,0 +1,167 @@
+#include "relapack.h"
+
+static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
+    const int *, const int *, const float *, const float *, const int *,
+    const float *, const int *, const float *, float *, const int *);
+
+static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
+    const int *, const int *, const float *, const float *, const int *,
+    const float *, const int *, const float *, float *, const int *);
+
+
+/** CGEMMT computes a matrix-matrix product with general matrices but updates
+ * only the upper or lower triangular part of the result matrix.
+ *
+ * This routine performs the same operation as the BLAS routine
+ * cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
+ * but only updates the triangular part of C specified by uplo:
+ * If (*uplo == 'L'), only the lower triangular part of C is updated,
+ * otherwise the upper triangular part is updated.
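+ * (When HAVE_XGEMMT is set to 1 in config.h, the call is forwarded directly
+ * to the BLAS implementation's cgemmt; otherwise the recursive kernel below
+ * is used.)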
+ * */ +void RELAPACK_cgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int ctransA = LAPACK(lsame)(transA, "C"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + const int ctransB = LAPACK(lsame)(transB, "C"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !ctransA && !notransA) + info = 2; + else if (!tranB && !ctransB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("CGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); + const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); + + // Recursive kernel + RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** cgemmt's recursive compute kernel */ +static void RELAPACK_cgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { + // Unblocked + RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const float *const A_T = A; + const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const float *const B_L = B; + const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + float *const C_TL = C; + float *const C_TR = C + 2 * *ldC * n1; + float *const C_BL = C + 2 * n1; + float *const C_BR = C + 2 * *ldC * n1 + 2 * n1; + + // recursion(C_TL) + RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** cgemmt's unblocked compute kernel */ +static void RELAPACK_cgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 
1 : *ldB;
+    const int incC = 1;
+
+    int i;
+    for (i = 0; i < *n; i++) {
+        // A_0
+        // A_i
+        const float *const A_0 = A;
+        const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
+
+        // * B_i *
+        const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
+
+        // * C_0i *
+        // * C_ii *
+        float *const C_0i = C + 2 * *ldC * i;
+        float *const C_ii = C + 2 * *ldC * i + 2 * i;
+
+        if (*uplo == 'L') {
+            const int nmi = *n - i;
+            if (*transA == 'N')
+                BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+            else
+                BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+        } else {
+            const int ip1 = i + 1;
+            if (*transA == 'N')
+                BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+            else
+                BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+        }
+    }
+}
diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c
new file mode 100644
index 000000000..b31a711d0
--- /dev/null
+++ b/relapack/src/cgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
+    const int *, int *, int *);
+
+
+/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's cgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
+ */
+void RELAPACK_cgetrf(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const float ONE[] = { 1., 0. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const float *const A_L = A;
+        float *const A_R = A + 2 * *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** cgetrf's recursive compute kernel */
+static void RELAPACK_cgetrf_rec(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
+        // Unblocked
+        LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0.
}; + const int iONE[] = { 1 }; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // recursion(A_L, ipiv_T) + RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + // apply pivots to A_R + LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); + + // A_TR = A_TL \ A_TR + BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_BL * A_TR + BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); + + // recursion(A_BR, ipiv_B) + RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); + if (*info) + *info += n1; + // apply pivots to A_BL + LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); + // shift pivots + int i; + for (i = 0; i < n2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c new file mode 100644 index 000000000..dff875017 --- /dev/null +++ b/relapack/src/chegst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_chegst_rec(const int *, const char *, const int *, + float *, const int *, const float *, const int *, + float *, const int *, int *); + + +/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's chegst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html + * */ +void RELAPACK_chegst( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CHEGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Allocate work space + float *Work = NULL; + int lWork = 0; +#if XSYGST_ALLOW_MALLOC + const int n1 = CREC_SPLIT(*n); + lWork = n1 * (*n - n1); + Work = malloc(lWork * 2 * sizeof(float)); + if (!Work) + lWork = 0; +#endif + + // recursive kernel + RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); + + // Free work space +#if XSYGST_ALLOW_MALLOC + if (Work) + free(Work); +#endif +} + + +/** chegst's recursive compute kernel */ +static void RELAPACK_chegst_rec( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + float *Work, const int *lWork, int *info +) { + + if (*n <= MAX(CROSSOVER_CHEGST, 1)) { + // Unblocked + LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info); + return; + } + + // Constants + const float ZERO[] = { 0., 0. }; + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const float HALF[] = { .5, 0. }; + const float MHALF[] = { -.5, 0. 
};
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = CREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + 2 * *ldA * n1;
+    float *const A_BL = A + 2 * n1;
+    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const float *const B_TL = B;
+    const float *const B_TR = B + 2 * *ldB * n1;
+    const float *const B_BL = B + 2 * n1;
+    const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR' * A_BL
+            BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR'
+            BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/chetrf.c b/relapack/src/chetrf.c
new file mode 100644
index 000000000..2928235e4
--- /dev/null
+++ b/relapack/src/chetrf.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *,
+    float *, const int *, int *, float *, const int *, int *);
+
+
+/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's chetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
+ * */
+void RELAPACK_chetrf(
+    const char *uplo, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *lWork, int *info
+) {
+
+    // Required work size
+    const int cleanlWork = *n * (*n / 2);
+    int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+    minlWork = 1;
+#endif
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minlWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Work size query
+        *Work = cleanlWork;
+        return;
+    }
+
+    // Ensure Work size
+    float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < cleanlWork) {
+        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
+        if (!cleanWork)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CHETRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Dummy argument
+    int nout;
+
+    // Recursive kernel
+    RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (cleanWork != Work)
+        free(cleanWork);
+#endif
+}
+
+
+/** chetrf's recursive compute kernel */
+static void RELAPACK_chetrf_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *ldWork, int *info
+) {
+
+    // top recursion level?
+ const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? 
n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/chetrf_rec2.c b/relapack/src/chetrf_rec2.c
new file mode 100644
index 000000000..b5c8341b6
--- /dev/null
+++ b/relapack/src/chetrf_rec2.c
@@ -0,0 +1,520 @@
+/* -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+   on Microsoft Windows system, link with libf2c.lib;
+   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+   or, if you install libf2c.a in a standard place, with -lf2c -lm
+   -- in that order, at the end of the command line, as in
+      cc *.o -lf2c -lm
+   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+      http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clahef.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
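+ * (Added note: the number of columns actually factorized is returned in *kb;
+ * as in clahef, this can fall one short of *nb when a 2-by-2 pivot would
+ * cross the panel boundary, which the callers in chetrf.c compensate for.)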
+ * */ +/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * + nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, + int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + float r__1, r__2, r__3, r__4; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + double sqrt(double), r_imag(complex *); + void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); + + /* Local variables */ + static int j, k; + static float t, r1; + static complex d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * + , complex *, int *, complex *, int *, complex *, complex * + , int *, ftnlen), ccopy_(int *, complex *, int *, + complex *, int *), cswap_(int *, complex *, int *, + complex *, int *); + static int kstep; + static float absakk; + extern /* Subroutine */ int clacgv_(int *, complex *, int *); + extern int icamax_(int *, complex *, int *); + extern /* Subroutine */ int csscal_(int *, float *, complex *, int + *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + i__1 = k - 1; + ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k + kw * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + i__1 = k + kw * w_dim1; + i__2 = k + kw * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k + kw * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = imax + kw * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + kw * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - 1; + ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k - imax; + jmax = imax + 
icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (kw - 1) * w_dim1]), dabs(r__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + r1 = 1.f / a[i__1].r; + i__1 = k - 1; + csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + kw * w_dim1], &q__2); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + q__2.r = t, q__2.i = 0.f; + c_div(&q__1, &q__2, &d21); + d21.r = q__1.r, d21.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + r_cnjg(&q__2, &d21); + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = + q__2.r * q__3.i + q__2.i * q__3.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); 
+ i__1 = k - 2; + clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (imax < *n) { + i__1 = *n - imax; + ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (k + 1) * w_dim1]), dabs(r__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk 
* a_dim1], &c__1, &a[kp + (kk +
+ 1) * a_dim1], lda);
+ i__1 = kp - kk - 1;
+ clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
+ if (kp < *n) {
+ i__1 = *n - kp;
+ ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 +
+ kp * a_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = k - 1;
+ cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ }
+ cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ i__1 = k + k * a_dim1;
+ r1 = 1.f / a[i__1].r;
+ i__1 = *n - k;
+ csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ i__1 = *n - k;
+ clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ }
+ } else {
+ if (k < *n - 1) {
+ i__1 = k + 1 + k * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+ d11.r = q__1.r, d11.i = q__1.i;
+ r_cnjg(&q__2, &d21);
+ c_div(&q__1, &w[k + k * w_dim1], &q__2);
+ d22.r = q__1.r, d22.i = q__1.i;
+ q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
+ d22.i + d11.i * d22.r;
+ t = 1.f / (q__1.r - 1.f);
+ q__2.r = t, q__2.i = 0.f;
+ c_div(&q__1, &q__2, &d21);
+ d21.r = q__1.r, d21.i = q__1.i;
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ r_cnjg(&q__2, &d21);
+ i__3 = j + k * w_dim1;
+ q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + (k + 1) * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
+ q__2.r * q__3.i + q__2.i * q__3.r;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = j + (k + 1) * a_dim1;
+ i__3 = j + (k + 1) * w_dim1;
+ q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + k * w_dim1;
+ q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
+ .i;
+ q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
+ d21.r * q__2.i + d21.i * q__2.r;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+ }
+ }
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + k * a_dim1;
+ i__2 = k + 1 + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + (k + 1) * a_dim1;
+ i__2 = k + 1 + (k + 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = *n - k;
+ clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = *n - k - 1;
+ clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -kp;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+ j = k - 1;
+L120:
+ jj = j;
+ jp = ipiv[j];
+ if (jp < 0) {
+ jp = -jp;
+ --j;
+ }
+ --j;
+ if (jp != jj && j >= 1) {
+ cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j >= 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c
new file mode 100644
index 000000000..086393d57
--- /dev/null
+++ b/relapack/src/chetrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's chetrf_rook.
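+ * As in the other _rec2-based routines in this patch, the blocked
+ * level-3 updates of LAPACK's blocked algorithm are performed by the
+ * recursive kernel below; the unblocked kernel runs only at the
+ * recursion leaves.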
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html + * */ +void RELAPACK_chetrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** chetrf_rook's recursive compute kernel */ +static void RELAPACK_chetrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/chetrf_rook_rec2.c b/relapack/src/chetrf_rook_rec2.c new file mode 100644 index 000000000..a42cbfd44 --- /dev/null +++ b/relapack/src/chetrf_rook_rec2.c @@ -0,0 +1,661 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clahef_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
+ int *nb, int *kb, complex *a, int *lda, int *ipiv,
+ complex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ float r__1, r__2;
+ complex q__1, q__2, q__3, q__4, q__5;
+
+ /* Builtin functions */
+ double sqrt(double), r_imag(complex *);
+ void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
+
+ /* Local variables */
+ static int j, k, p;
+ static float t, r1;
+ static complex d11, d21, d22;
+ static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+ static logical done;
+ static int imax, jmax;
+ static float alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+ , complex *, int *, complex *, int *, complex *, complex *
+ , int *, ftnlen);
+ static float sfmin;
+ extern /* Subroutine */ int ccopy_(int *, complex *, int *,
+ complex *, int *);
+ static int itemp;
+ extern /* Subroutine */ int cswap_(int *, complex *, int *,
+ complex *, int *);
+ static int kstep;
+ static float stemp, absakk;
+ extern /* Subroutine */ int clacgv_(int *, complex *, int *);
+ extern int icamax_(int *, complex *, int *);
+ extern double slamch_(char *, ftnlen);
+ extern /* Subroutine */ int csscal_(int *, float *, complex *, int
+ *);
+ static float colmax, rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.f) + 1.f) / 8.f;
+ sfmin = slamch_("S", (ftnlen)1);
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ kstep = 1;
+ p = k;
+ if (k > 1) {
+ i__1 = k - 1;
+ ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
+ c__1);
+ }
+ i__1 = k + kw * w_dim1;
+ i__2 = k + k * a_dim1;
+ r__1 = a[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ i__1 = k + kw * w_dim1;
+ i__2 = k + kw * w_dim1;
+ r__1 = w[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ }
+ i__1 = k + kw * w_dim1;
+ absakk = (r__1 = w[i__1].r, dabs(r__1));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (r__1 =
w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + kw * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + r__1 = w[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + if (k > 1) { + i__1 = k - 1; + ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], + &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + if (imax > 1) { + i__1 = imax - 1; + ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + } + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! 
done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = k - 1 - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + i__1 = k - 1 - p; + clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); + if (p > 1) { + i__1 = p - 1; + ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (dabs(t) >= sfmin) { + r1 = 1.f / t; + i__1 = k - 1; + csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L14: */ + } + } + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + kw * w_dim1], &q__2); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d21); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + r_cnjg(&q__5, &d21); + c_div(&q__2, &q__3, &q__5); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + clacgv_(&i__1, &w[(kw - 1) * 
w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + ++jj; + if (kstep == 2 && jp1 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * + a_dim1], &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (imax < *n) { + i__1 = *n - imax; + ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (k + 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (k + 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! 
((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = p - k - 1; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * + a_dim1], lda); + i__1 = p - k - 1; + clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); + if (p < *n) { + i__1 = *n - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p + * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + } + cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (dabs(t) >= sfmin) { + r1 = 1.f / t; + i__1 = *n - k; + csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L74: */ + } + } + i__1 = *n - k; + clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + k * w_dim1], &q__2); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + r_cnjg(&q__5, &d21); + c_div(&q__2, &q__3, &q__5); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d21); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = 
w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = *n - k; + clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = *n - k - 1; + clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + --jj; + if (kstep == 2 && jp1 != jj && j >= 1) { + cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c new file mode 100644 index 000000000..36d6297cf --- /dev/null +++ b/relapack/src/clauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_clauum_rec(const char *, const int *, float *, + const int *, int *); + + +/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's clauum. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html + * */ +void RELAPACK_clauum( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** clauum's recursive compute kernel */ +static void RELAPACK_clauum_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { + // Unblocked + LAPACK(clauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. 
}; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c new file mode 100644 index 000000000..e0ea7b944 --- /dev/null +++ b/relapack/src/cpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, + float *, const int *, float *, const int *, int *); + + +/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's cpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html + * */ +void RELAPACK_cpbtrf( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const float ZERO[] = { 0., 0. }; + + // Allocate work space + const int n1 = CREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + float *Work = malloc(mWork * nWork * 2 * sizeof(float)); + LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** cpbtrf's recursive compute kernel */ +static void RELAPACK_cpbtrf_rec( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + float *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { + // Unblocked + LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * ((*uplo == 'L') ? 
0 : *kd); + + // Splitting + const int n1 = MIN(CREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + float *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + 2 * *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + 2 * n21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + 2 * *ldA * n21; + float *const A_BRbl = A_BR + 2 * n21; + float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c new file mode 100644 index 000000000..e35caa7fa --- /dev/null +++ b/relapack/src/cpotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static void RELAPACK_cpotrf_rec(const char *, const int *, float *, + const int *, int *); + + +/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's cpotrf. 
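+ * The recursive kernel below splits A into two roughly equal halves
+ * (CREC_SPLIT), factors A_TL, forms the off-diagonal panel with one
+ * ctrsm, updates the trailing block with one cherk, and recurses on
+ * A_BR.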
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html
+ * */
+void RELAPACK_cpotrf(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CPOTRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Recursive kernel
+ RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** cpotrf's recursive compute kernel */
+static void RELAPACK_cpotrf_rec(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+){
+
+ if (*n <= MAX(CROSSOVER_CPOTRF, 1)) {
+ // Unblocked
+ LAPACK(cpotf2)(uplo, n, A, ldA, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+
+ // Splitting
+ const int n1 = CREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ float *const A_TL = A;
+ float *const A_TR = A + 2 * *ldA * n1;
+ float *const A_BL = A + 2 * n1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+ // recursion(A_TL)
+ RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info);
+ if (*info)
+ return;
+
+ if (*uplo == 'L') {
+ // A_BL = A_BL / A_TL'
+ BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
+ // A_BR = A_BR - A_BL * A_BL'
+ BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
+ } else {
+ // A_TR = A_TL' \ A_TR
+ BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+ // A_BR = A_BR - A_TR' * A_TR
+ BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
+ }
+
+ // recursion(A_BR)
+ RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info);
+ if (*info)
+ *info += n1;
+}
diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c
new file mode 100644
index 000000000..01c161d1a
--- /dev/null
+++ b/relapack/src/csytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's csytrf.
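+ * The Work buffer must hold n*(n/2) complex entries; lWork == -1
+ * requests a workspace-size query, and when XSYTRF_ALLOW_MALLOC is
+ * defined a smaller lWork is accepted and a large enough buffer is
+ * allocated internally with malloc.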
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html + * */ +void RELAPACK_csytrf( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** csytrf's recursive compute kernel */ +static void RELAPACK_csytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/csytrf_rec2.c b/relapack/src/csytrf_rec2.c new file mode 100644 index 000000000..9d6bd849d --- /dev/null +++ b/relapack/src/csytrf_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
+ nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
+ int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ float r__1, r__2, r__3, r__4;
+ complex q__1, q__2, q__3;
+
+ /* Builtin functions */
+ double sqrt(double), r_imag(complex *);
+ void c_div(complex *, complex *, complex *);
+
+ /* Local variables */
+ static int j, k;
+ static complex t, r1, d11, d21, d22;
+ static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+ static float alpha;
+ extern /* Subroutine */ int cscal_(int *, complex *, complex *,
+ int *);
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+ , complex *, int *, complex *, int *, complex *, complex *
+ , int *, ftnlen), ccopy_(int *, complex *, int *,
+ complex *, int *), cswap_(int *, complex *, int *,
+ complex *, int *);
+ static int kstep;
+ static float absakk;
+ extern int icamax_(int *, complex *, int *);
+ static float colmax, rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.f) + 1.f) / 8.f;
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ }
+ kstep = 1;
+ i__1 = k + kw * w_dim1;
+ absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
+ w_dim1]), dabs(r__2));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ + kw * w_dim1]), dabs(r__2));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ } else {
+ if (absakk >= alpha * colmax) {
+ kp = k;
+ } else {
+ ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ i__1 = k - imax;
+ ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
+ a_dim1 + 1], lda,
&w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (kw - 1) * w_dim1]), dabs(r__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha * + rowmax) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = k - 1; + cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + kw * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * + d22.i + d11.i * d22.r; + q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; + c_div(&q__1, &c_b1, &q__2); + t.r = q__1.r, t.i = q__1.i; + c_div(&q__1, &t, &d21); + d21.r = q__1.r, d21.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } 
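+/* Note: the pivot bookkeeping below follows LAPACK's convention: a
+   positive ipiv[k] records a 1x1 pivot interchanged with row/column kp,
+   while a 2x2 pivot stores -kp in both ipiv[k] and ipiv[k-1]. */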
+ } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * + w_dim1]), dabs(r__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (k + 1) * w_dim1]), dabs(r__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha * + rowmax) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = *n - k; + cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = 
k + 1 + k * w_dim1;
+		d21.r = w[i__1].r, d21.i = w[i__1].i;
+		c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		d11.r = q__1.r, d11.i = q__1.i;
+		c_div(&q__1, &w[k + k * w_dim1], &d21);
+		d22.r = q__1.r, d22.i = q__1.i;
+		q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
+			d22.i + d11.i * d22.r;
+		q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
+		c_div(&q__1, &c_b1, &q__2);
+		t.r = q__1.r, t.i = q__1.i;
+		c_div(&q__1, &t, &d21);
+		d21.r = q__1.r, d21.i = q__1.i;
+		i__1 = *n;
+		for (j = k + 2; j <= i__1; ++j) {
+		    i__2 = j + k * a_dim1;
+		    i__3 = j + k * w_dim1;
+		    q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+			    q__3.i = d11.r * w[i__3].i + d11.i * w[i__3].r;
+		    i__4 = j + (k + 1) * w_dim1;
+		    q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4].i;
+		    q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
+			    d21.r * q__2.i + d21.i * q__2.r;
+		    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+		    i__2 = j + (k + 1) * a_dim1;
+		    i__3 = j + (k + 1) * w_dim1;
+		    q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+			    q__3.i = d22.r * w[i__3].i + d22.i * w[i__3].r;
+		    i__4 = j + k * w_dim1;
+		    q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4].i;
+		    q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
+			    d21.r * q__2.i + d21.i * q__2.r;
+		    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+		}
+	    }
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + k * a_dim1;
+	    i__2 = k + 1 + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + (k + 1) * a_dim1;
+	    i__2 = k + 1 + (k + 1) * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	}
+    }
+    if (kstep == 1) {
+	ipiv[k] = kp;
+    } else {
+	ipiv[k] = -kp;
+	ipiv[k + 1] = -kp;
+    }
+    k += kstep;
+    goto L70;
+L90:
+    j = k - 1;
+L120:
+    jj = j;
+    jp = ipiv[j];
+    if (jp < 0) {
+	jp = -jp;
+	--j;
+    }
+    --j;
+    if (jp != jj && j >= 1) {
+	cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    if (j > 1) {
+	goto L120;
+    }
+    *kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c
new file mode 100644
index 000000000..aa7dd0e57
--- /dev/null
+++ b/relapack/src/csytrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *,
+    float *, const int *, int *, float *, const int *, int *);
+
+
+/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's csytrf_rook.
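+ * A workspace query is supported: if *lWork == -1, the routine only
+ * stores the required size of Work in Work[0] and returns; if
+ * XSYTRF_ALLOW_MALLOC is set, a too small Work buffer is replaced by a
+ * temporary allocation instead. A minimal query sketch (illustrative
+ * only; n, A, ldA and ipiv stand for the caller's data):
+ *
+ *     int lWork = -1, info;
+ *     float query;
+ *     RELAPACK_csytrf_rook("L", &n, A, &ldA, ipiv, &query, &lWork, &info);
+ *     lWork = (int)query;  // allocate Work, then call again to factorize
+ *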
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html + * */ +void RELAPACK_csytrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** csytrf_rook's recursive compute kernel */ +static void RELAPACK_csytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/csytrf_rook_rec2.c b/relapack/src/csytrf_rook_rec2.c new file mode 100644 index 000000000..6638338a6 --- /dev/null +++ b/relapack/src/csytrf_rook_rec2.c @@ -0,0 +1,565 @@ +/* -- translated by f2c (version 20100827). 
+	You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
+	int *nb, int *kb, complex *a, int *lda, int *ipiv,
+	complex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    float r__1, r__2;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions */
+    double sqrt(double), r_imag(complex *);
+    void c_div(complex *, complex *, complex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static complex t, r1, d11, d12, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static float alpha;
+    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
+	    int *);
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+	    , complex *, int *, complex *, int *, complex *, complex *
+	    , int *, ftnlen);
+    static float sfmin;
+    extern /* Subroutine */ int ccopy_(int *, complex *, int *,
+	    complex *, int *);
+    static int itemp;
+    extern /* Subroutine */ int cswap_(int *, complex *, int *,
+	    complex *, int *);
+    static int kstep;
+    static float stemp, absakk;
+    extern int icamax_(int *, complex *, int *);
+    extern double slamch_(char *, ftnlen);
+    static float colmax, rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.f) + 1.f) / 8.f;
+    sfmin = slamch_("S", (ftnlen)1);
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	kstep = 1;
+	p = k;
+	ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    q__1.r = -1.f, q__1.i = -0.f;
+	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+		    lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	i__1 = k + kw * w_dim1;
+	absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
+		w_dim1]), dabs(r__2));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+		    + kw * w_dim1]), dabs(r__2));
+	} else {
+	    colmax = 0.f;
+	}
+	if (dmax(absakk,colmax) == 0.f) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	    ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	} else {
+	    if (!
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha * + rowmax)) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = k - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k + + k * a_dim1]), dabs(r__2)) >= sfmin) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = k - 1; + cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0.f || a[i__1].i != 0.f) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L14: */ + } + } + } + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d12.r = w[i__1].r, d12.i = w[i__1].i; + c_div(&q__1, &w[k + kw * w_dim1], &d12); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, 
q__3.i = d11.r * + d22.i + d11.i * d22.r; + q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; + c_div(&q__1, &c_b1, &q__2); + t.r = q__1.r, t.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d12); + q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * + q__2.i + t.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d12); + q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * + q__2.i + t.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * + w_dim1]), dabs(r__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (k + 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (k + 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha * + rowmax)) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - k - 1; + ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k + + k * a_dim1]), dabs(r__2)) >= sfmin) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = *n - k; + cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0.f || a[i__1].i != 0.f) { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L74: */ + } + } + } + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k + k * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * + d22.i + d11.i * d22.r; 
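+/* The 2-by-2 pivot D = [[W(k,k), W(k+1,k)], [W(k+1,k), W(k+1,k+1)]] is
+   inverted in scaled form: with d21 = W(k+1,k), d11 = W(k+1,k+1)/d21,
+   d22 = W(k,k)/d21, and t = 1/(d11*d22 - 1) (computed next), the
+   inverse is (t/d21) * [[d11, -1], [-1, d22]], which the loop below
+   applies to rows j > k+1 of columns k and k+1. */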
+		q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
+		c_div(&q__1, &c_b1, &q__2);
+		t.r = q__1.r, t.i = q__1.i;
+		i__1 = *n;
+		for (j = k + 2; j <= i__1; ++j) {
+		    i__2 = j + k * a_dim1;
+		    i__3 = j + k * w_dim1;
+		    q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+			    q__4.i = d11.r * w[i__3].i + d11.i * w[i__3].r;
+		    i__4 = j + (k + 1) * w_dim1;
+		    q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4].i;
+		    c_div(&q__2, &q__3, &d21);
+		    q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
+			    q__2.i + t.i * q__2.r;
+		    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+		    i__2 = j + (k + 1) * a_dim1;
+		    i__3 = j + (k + 1) * w_dim1;
+		    q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+			    q__4.i = d22.r * w[i__3].i + d22.i * w[i__3].r;
+		    i__4 = j + k * w_dim1;
+		    q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4].i;
+		    c_div(&q__2, &q__3, &d21);
+		    q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
+			    q__2.i + t.i * q__2.r;
+		    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+		}
+	    }
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + k * a_dim1;
+	    i__2 = k + 1 + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + (k + 1) * a_dim1;
+	    i__2 = k + 1 + (k + 1) * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	}
+    }
+    if (kstep == 1) {
+	ipiv[k] = kp;
+    } else {
+	ipiv[k] = -p;
+	ipiv[k + 1] = -kp;
+    }
+    k += kstep;
+    goto L70;
+L90:
+    j = k - 1;
+L120:
+    kstep = 1;
+    jp1 = 1;
+    jj = j;
+    jp2 = ipiv[j];
+    if (jp2 < 0) {
+	jp2 = -jp2;
+	--j;
+	jp1 = -ipiv[j];
+	kstep = 2;
+    }
+    --j;
+    if (jp2 != jj && j >= 1) {
+	cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    jj = j + 1;
+    if (jp1 != jj && kstep == 2) {
+	cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    if (j >= 1) {
+	goto L120;
+    }
+    *kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c
new file mode 100644
index 000000000..15c738baf
--- /dev/null
+++ b/relapack/src/ctgsyl.c
@@ -0,0 +1,268 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *,
+    const int *, const float *, const int *, const float *, const int *,
+    float *, const int *, const float *, const int *, const float *,
+    const int *, float *, const int *, float *, float *, float *, int *);
+
+
+/** CTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's ctgsyl.
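+ * For trans = 'N' it solves the pair of equations
+ *
+ *     A * R - L * B = scale * C
+ *     D * R - L * E = scale * F
+ *
+ * for R and L, which overwrite C and F; for trans = 'C' the
+ * conjugate-transposed dual system is solved (see the LAPACK
+ * documentation linked below for the exact form).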
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
+ * */
+void RELAPACK_ctgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dif,
+    float *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "C");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CTGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'C';
+
+    // Constant
+    const float ZERO[] = { 0., 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    float scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        float dscale = 0;
+        float dsum = 1;
+        RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
+                LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** ctgsyl's recursive compute kernel */
+static void RELAPACK_ctgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dsum, float *dscale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
+        // Unblocked
+        LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1., 0. };
+    float scale2[] = { 1., 0.
}; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + const int m1 = CREC_SPLIT(*m); + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const float *const A_TL = A; + const float *const A_TR = A + 2 * *ldA * m1; + const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; + + // C_T + // C_B + float *const C_T = C; + float *const C_B = C + 2 * m1; + + // D_TL D_TR + // 0 D_BR + const float *const D_TL = D; + const float *const D_TR = D + 2 * *ldD * m1; + const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1; + + // F_T + // F_B + float *const F_T = F; + float *const F_B = F + 2 * m1; + + if (*trans == 'N') { + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); + // C_T = C_T - A_TR * C_B + BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // F_T = F_T - D_TR * C_B + BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); + } + } else { + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); + // C_B = C_B - A_TR^H * C_T + BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // C_B = C_B - D_TR^H * F_T + BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); + } + } + } else { + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const float *const B_TL = B; + const float *const B_TR = B + 2 * *ldB * n1; + const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // C_L C_R + float *const C_L = C; + float *const C_R = C + 2 * *ldC * n1; + + // E_TL E_TR + // 0 E_BR + const float *const E_TL = E; + const float *const E_TR = E + 2 * *ldE * n1; + const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1; + + // F_L F_R + float *const F_L = F; + float *const F_R = F + 2 * *ldF * n1; + + if (*trans == 'N') { + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); + // C_R = C_R + F_L * B_TR + BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); + // F_R = F_R + F_L * E_TR + BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, 
&n1, C_L, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); + } + } else { + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); + // F_L = F_L + C_R * B_TR + BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); + // F_L = F_L + F_R * E_TR + BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); + } + } + } + + *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c new file mode 100644 index 000000000..b548d5354 --- /dev/null +++ b/relapack/src/ctrsyl.c @@ -0,0 +1,163 @@ +#include "relapack.h" + +static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, int *); + + +/** CTRSYL solves the complex Sylvester matrix equation. + * + * This routine is functionally equivalent to LAPACK's ctrsyl. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html + * */ +void RELAPACK_ctrsyl( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + + // Check arguments + const int notransA = LAPACK(lsame)(tranA, "N"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int ctransB = LAPACK(lsame)(tranB, "C"); + *info = 0; + if (!ctransA && !notransA) + *info = -1; + else if (!ctransB && !notransB) + *info = -2; + else if (*isgn != 1 && *isgn != -1) + *info = -3; + else if (*m < 0) + *info = -4; + else if (*n < 0) + *info = -5; + else if (*ldA < MAX(1, *m)) + *info = -7; + else if (*ldB < MAX(1, *n)) + *info = -9; + else if (*ldC < MAX(1, *m)) + *info = -11; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CTRSYL", &minfo); + return; + } + + // Clean char * arguments + const char cleantranA = notransA ? 'N' : 'C'; + const char cleantranB = notransB ? 'N' : 'C'; + + // Recursive kernel + RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} + + +/** ctrsyl's recursive compute kernel */ +static void RELAPACK_ctrsyl_rec( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + + if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { + // Unblocked + RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const float MSGN[] = { -*isgn, 0. }; + const int iONE[] = { 1 }; + + // Outputs + float scale1[] = { 1., 0. 
};
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        const int m1 = CREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const float *const A_TL = A;
+        const float *const A_TR = A + 2 * *ldA * m1;
+        const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        float *const C_T = C;
+        float *const C_B = C + 2 * m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        const int n1 = CREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const float *const B_TL = B;
+        const float *const B_TR = B + 2 * *ldB * n1;
+        const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        float *const C_L = C;
+        float *const C_R = C + 2 * *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c
new file mode 100644
index 000000000..518574868
--- /dev/null
+++ b/relapack/src/ctrsyl_rec2.c
@@ -0,0 +1,392 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "../config.h" +#include "f2c.h" + +#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { + extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); + complex result; + cdotu_(&result, n, x, incx, y, incy); + return result; +} +#define cdotu_ cdotu_fun + +complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { + extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); + complex result; + cdotc_(&result, n, x, incx, y, incy); + return result; +} +#define cdotc_ cdotc_fun +#endif + +#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +complex cladiv_fun(complex *a, complex *b) { + extern void cladiv_(complex *, complex *, complex *); + complex result; + cladiv_(&result, a, b); + return result; +} +#define cladiv_ cladiv_fun +#endif + +/* Table of constant values */ + +static int c__1 = 1; + +/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) + * + * This routine is an exact copy of LAPACK's ctrsyl. + * It serves as an unblocked kernel in the recursive algorithms. + * */ +/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int + *isgn, int *m, int *n, complex *a, int *lda, complex *b, + int *ldb, complex *c__, int *ldc, float *scale, int *info, + ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + float r__1, r__2; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + float r_imag(complex *); + void r_cnjg(complex *, complex *); + + /* Local variables */ + static int j, k, l; + static complex a11; + static float db; + static complex x11; + static float da11; + static complex vec; + static float dum[1], eps, sgn, smin; + static complex suml, sumr; + /* Complex */ complex cdotc_(int *, complex *, int + *, complex *, int *); + extern int lsame_(char *, char *, ftnlen, ftnlen); + /* Complex */ complex cdotu_(int *, complex *, int + *, complex *, int *); + extern /* Subroutine */ int slabad_(float *, float *); + extern float clange_(char *, int *, int *, complex *, + int *, float *, ftnlen); + /* Complex */ complex cladiv_(complex *, complex *); + static float scaloc; + extern float slamch_(char *, ftnlen); + extern /* Subroutine */ int csscal_(int *, float *, complex *, int + *), xerbla_(char *, int *, ftnlen); + static float bignum; + static int notrna, notrnb; + static float smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CTRSY2", &i__1, (ftnlen)6); + return; + } + *scale = 1.f; + if (*m == 0 || *n == 0) { + return; + } + eps = slamch_("P", (ftnlen)1); + smlnum = slamch_("S", (ftnlen)1); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + smlnum = smlnum * (float) (*m * *n) / eps; + bignum = 1.f / smlnum; +/* Computing MAX */ + r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = dmax(r__1,r__2); + sgn = (float) (*isgn); + if (notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + for (k = *m; k >= 1; --k) { + i__2 = *m - k; +/* Computing MIN */ + i__3 = k + 1; +/* Computing MIN */ + i__4 = k + 1; + q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ + min(i__4,*m) + l * c_dim1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__2 = l - 1; + q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = q__1.r, sumr.i = q__1.i; + i__2 = k + l * c_dim1; + q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i; + q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L20: */ + } +/* L30: */ + } + } else if (! 
notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + i__3 = k - 1; + q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__3 = l - 1; + q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = q__1.r, sumr.i = q__1.i; + i__3 = k + l * c_dim1; + q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + r_cnjg(&q__2, &a[k + k * a_dim1]); + i__3 = l + l * b_dim1; + q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; + q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + i__3 = k + l * c_dim1; + c__[i__3].r = x11.r, c__[i__3].i = x11.i; +/* L50: */ + } +/* L60: */ + } + } else if (! notrna && ! notrnb) { + for (l = *n; l >= 1; --l) { + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = k - 1; + q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__2 = *n - l; +/* Computing MIN */ + i__3 = l + 1; +/* Computing MIN */ + i__4 = l + 1; + q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ + l + min(i__4,*n) * b_dim1], ldb); + sumr.r = q__1.r, sumr.i = q__1.i; + i__2 = k + l * c_dim1; + r_cnjg(&q__4, &sumr); + q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; + q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i; + r_cnjg(&q__1, &q__2); + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L80: */ + } +/* L90: */ + } + } else if (notrna && ! 
notrnb) { + for (l = *n; l >= 1; --l) { + for (k = *m; k >= 1; --k) { + i__1 = *m - k; +/* Computing MIN */ + i__2 = k + 1; +/* Computing MIN */ + i__3 = k + 1; + q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[ + min(i__3,*m) + l * c_dim1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__1 = *n - l; +/* Computing MIN */ + i__2 = l + 1; +/* Computing MIN */ + i__3 = l + 1; + q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[ + l + min(i__3,*n) * b_dim1], ldb); + sumr.r = q__1.r, sumr.i = q__1.i; + i__1 = k + l * c_dim1; + r_cnjg(&q__4, &sumr); + q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__1 = k + k * a_dim1; + r_cnjg(&q__3, &b[l + l * b_dim1]); + q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i; + q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + i__1 = k + l * c_dim1; + c__[i__1].r = x11.r, c__[i__1].i = x11.i; +/* L110: */ + } +/* L120: */ + } + } + return; +} diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c new file mode 100644 index 000000000..0262cb59d --- /dev/null +++ b/relapack/src/ctrtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, + float *, const int *, int *); + + +/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. + * + * This routine is functionally equivalent to LAPACK's ctrtri. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html + * */ +void RELAPACK_ctrtri( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (!nounit && !unit) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CTRTRI", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleandiag = nounit ? 
'N' : 'U'; + + // check for singularity + if (nounit) { + int i; + for (i = 0; i < *n; i++) + if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { + *info = i; + return; + } + } + + // Recursive kernel + RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); +} + + +/** ctrtri's recursive compute kernel */ +static void RELAPACK_ctrtri_rec( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { + // Unblocked + LAPACK(ctrti2)(uplo, diag, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = - A_BL * A_TL + BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); + // A_BL = A_BR \ A_BL + BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TR = - A_TL * A_TR + BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); + // A_TR = A_TR / A_BR + BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c new file mode 100644 index 000000000..1a1757d31 --- /dev/null +++ b/relapack/src/dgbtrf.c @@ -0,0 +1,227 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, + const int *, double *, const int *, int *, double *, const int *, double *, + const int *, int *); + + +/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's dgbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html + * */ +void RELAPACK_dgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DGBTRF", &minfo); + return; + } + + // Constant + const double ZERO[] = { 0. }; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + double *const A_j = A + *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[i] = 0.; + } + + // Allocate work space + const int n1 = DREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? 
MAX(0, *n - *kl) : *kl; + double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); + double *Worku = malloc(mWorku * nWorku * sizeof(double)); + LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** dgbtrf's recursive compute kernel */ +static void RELAPACK_dgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { + // Unblocked + LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + kv; + + // Splitting + const int n1 = MIN(DREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + double *const Ab_L = Ab; + double *const Ab_BR = Ab + *ldAb * n1; + + // A_L A_R + double *const A_L = A; + double *const A_R = A + *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + m1; + double *const A_BR = A + *ldA * n1 + m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + double *const A_Rl = A_R; + double *const A_Rr = A_R + *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + m21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + *ldA * n21; + double *const A_BRbl = A_BR + m21; + double *const A_BRbr = A_BR + *ldA * n21 + m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // apply pivots to A_Rl + LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); + + // apply pivots to A_Rr columnwise + for (j = 0; j < n22; j++) { + double *const A_Rrj = A_Rr + *ldA * j; + for (i = j; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + const double tmp = A_Rrj[i]; + A_Rrj[i] = A_Rr[ip]; + A_Rrj[ip] = tmp; + } + } + } + + // A_TRl = A_TL \ A_TRl + BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // Worku = A_TRr + LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); + // Worku = A_TL \ Worku + BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + 
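+    // (The dtrsm above runs on Worku rather than on A_TRr in place:
+    //  A_TRr reaches above the stored band, so only its lower trapezoid
+    //  is physically present in Ab; the block is therefore staged through
+    //  Worku, whose unused triangle was zeroed at allocation, and copied
+    //  back below.)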
// A_TRr = Worku + LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); + // A_BRtl = A_BRtl - A_BLt * A_TRl + BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); + // A_BRbl = A_BRbl - Workl * A_TRl + BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); + // A_BRtr = A_BRtr - A_BLt * Worku + BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Workl * Worku + BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); + + // partially undo swaps in A_L + for (i = mn1 - 1; i >= 0; i--) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // recursion(Ab_BR, ipiv_B) + RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/dgemmt.c b/relapack/src/dgemmt.c new file mode 100644 index 000000000..9c925b586 --- /dev/null +++ b/relapack/src/dgemmt.c @@ -0,0 +1,165 @@ +#include "relapack.h" + +static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + +static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + + +/** DGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_dgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !notransA) + info = 2; + else if (!tranB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("DGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : 'T'; + const char cleantransB = notransB ? 
'N' : 'T'; + + // Recursive kernel + RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** dgemmt's recursive compute kernel */ +static void RELAPACK_dgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { + // Unblocked + RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const double *const A_T = A; + const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const double *const B_L = B; + const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + double *const C_TL = C; + double *const C_TR = C + *ldC * n1; + double *const C_BL = C + n1; + double *const C_BR = C + *ldC * n1 + n1; + + // recursion(C_TL) + RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** dgemmt's unblocked compute kernel */ +static void RELAPACK_dgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; + + int i; + for (i = 0; i < *n; i++) { + // A_0 + // A_i + const double *const A_0 = A; + const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i); + + // * B_i * + const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i); + + // * C_0i * + // * C_ii * + double *const C_0i = C + *ldC * i; + double *const C_ii = C + *ldC * i + i; + + if (*uplo == 'L') { + const int nmi = *n - i; + if (*transA == 'N') + BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + else + BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + } else { + const int ip1 = i + 1; + if (*transA == 'N') + BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + else + BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + } + } +} diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c new file mode 100644 index 000000000..07f5472fd --- /dev/null +++ b/relapack/src/dgetrf.c @@ -0,0 +1,117 @@ +#include "relapack.h" + +static void RELAPACK_dgetrf_rec(const int *, const int *, double *, + const int *, int *, int *); + + +/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's dgetrf. 
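 * 
 * A minimal calling sketch (illustrative values only; A is stored
 * column-major):
 *
 *     int m = 3, n = 3, ldA = 3, ipiv[3], info;
 *     double A[9] = { 4., 2., 1.,   2., 5., 3.,   1., 3., 6. };
 *     RELAPACK_dgetrf(&m, &n, A, &ldA, ipiv, &info);
 *     // on return A holds the L and U factors, ipiv the 1-based pivot
 *     // indices, and info == 0 indicates success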
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
+ * */
+void RELAPACK_dgetrf(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const double ONE[] = { 1. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const double *const A_L = A;
+        double *const A_R = A + *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** dgetrf's recursive compute kernel */
+static void RELAPACK_dgetrf_rec(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
+        // Unblocked
+        LAPACK(dgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR
+    BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR
+    BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/dlauum.c b/relapack/src/dlauum.c
new file mode 100644
index 000000000..d722ea809
--- /dev/null
+++ b/relapack/src/dlauum.c
@@ -0,0 +1,87 @@
+#include "relapack.h"
+
+static void RELAPACK_dlauum_rec(const char *, const int *, double *,
+    const int *, int *);
+
+
+/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
+ *
+ * This routine is functionally equivalent to LAPACK's dlauum.
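+ *
+ * A minimal calling sketch (illustrative values only): compute L**T * L for
+ * a 2-by-2 lower triangular factor stored column-major:
+ *
+ *     int n = 2, ldA = 2, info;
+ *     double A[4] = { 2., 3.,   0., 4. };   // L = [2 0; 3 4]
+ *     RELAPACK_dlauum("L", &n, A, &ldA, &info);
+ *     // the lower triangle of A now holds the lower triangle of L**T * L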
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html + * */ +void RELAPACK_dlauum( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** dlauum's recursive compute kernel */ +static void RELAPACK_dlauum_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { + // Unblocked + LAPACK(dlauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c new file mode 100644 index 000000000..6fd0ebe48 --- /dev/null +++ b/relapack/src/dpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, + double *, const int *, double *, const int *, int *); + + +/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's dpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html + * */ +void RELAPACK_dpbtrf( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const double ZERO[] = { 0. }; + + // Allocate work space + const int n1 = DREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; + double *Work = malloc(mWork * nWork * sizeof(double)); + LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** dpbtrf's recursive compute kernel */ +static void RELAPACK_dpbtrf_rec( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + double *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { + // Unblocked + LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + ((*uplo == 'L') ? 0 : *kd); + + // Splitting + const int n1 = MIN(DREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + double *const Ab_BR = Ab + *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, n1); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + n21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + *ldA * n21; + double *const A_BRbl = A_BR + n21; + double *const A_BRbr = A_BR + *ldA * n21 + n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dpotrf.c b/relapack/src/dpotrf.c new file mode 100644 index 000000000..c14fb3d71 --- /dev/null +++ b/relapack/src/dpotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static 
void RELAPACK_dpotrf_rec(const char *, const int *, double *, + const int *, int *); + + +/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's dpotrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html + * */ +void RELAPACK_dpotrf( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DPOTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info); +} + + +/** dpotrf's recursive compute kernel */ +static void RELAPACK_dpotrf_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { + // Unblocked + LAPACK(dpotf2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = A_BL / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); + // A_BR = A_BR - A_BL * A_BL' + BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); + } else { + // A_TR = A_TL' \ A_TR + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_TR' * A_TR + BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); + } + + // recursion(A_BR) + RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dsygst.c b/relapack/src/dsygst.c new file mode 100644 index 000000000..0228068ce --- /dev/null +++ b/relapack/src/dsygst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_dsygst_rec(const int *, const char *, const int *, + double *, const int *, const double *, const int *, + double *, const int *, int *); + + +/** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's dsygst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html + * */ +void RELAPACK_dsygst( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 
'L' : 'U';
+
+    // Allocate work space
+    double *Work = NULL;
+    int lWork = 0;
+#if XSYGST_ALLOW_MALLOC
+    const int n1 = DREC_SPLIT(*n);
+    lWork = n1 * (*n - n1);
+    Work = malloc(lWork * sizeof(double));
+    if (!Work)
+        lWork = 0;
+#endif
+
+    // recursive kernel
+    RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
+
+    // Free work space
+#if XSYGST_ALLOW_MALLOC
+    if (Work)
+        free(Work);
+#endif
+}
+
+
+/** dsygst's recursive compute kernel */
+static void RELAPACK_dsygst_rec(
+    const int *itype, const char *uplo, const int *n,
+    double *A, const int *ldA, const double *B, const int *ldB,
+    double *Work, const int *lWork, int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_DSYGST, 1)) {
+        // Unblocked
+        LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info);
+        return;
+    }
+
+    // Constants
+    const double ZERO[] = { 0. };
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const double HALF[] = { .5 };
+    const double MHALF[] = { -.5 };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const double *const B_TL = B;
+    const double *const B_TR = B + *ldB * n1;
+    const double *const B_BL = B + n1;
+    const double *const B_BR = B + *ldB * n1 + n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR' * A_BL
+            BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR'
+            BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c
new file mode 100644
index 000000000..80b119336
--- /dev/null
+++ b/relapack/src/dsytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** DSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's dsytrf.
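+ *
+ * A minimal calling sketch (illustrative values only), using the
+ * LAPACK-style workspace query via lWork == -1:
+ *
+ *     int n = 100, ldA = 100, ipiv[100], lWork = -1, info;
+ *     double query;
+ *     RELAPACK_dsytrf("L", &n, A, &ldA, ipiv, &query, &lWork, &info);
+ *     lWork = (int) query;               // n * (n / 2) elements
+ *     double *Work = malloc(lWork * sizeof(double));
+ *     RELAPACK_dsytrf("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *     // (A is an n-by-n column-major symmetric matrix set up elsewhere)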
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html + * */ +void RELAPACK_dsytrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** dsytrf's recursive compute kernel */ +static void RELAPACK_dsytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_DSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = DREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + double *const A_BL_B = A + *n; + double *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + n1; + double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = DREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + *ldA * n_rest; + double *const A_TR_T = A + *ldA * (n_rest + n1); + double *const A_TL = A + *ldA * n_rest + n_rest; + double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/dsytrf_rec2.c b/relapack/src/dsytrf_rec2.c new file mode 100644 index 000000000..72ef827b1 --- /dev/null +++ b/relapack/src/dsytrf_rec2.c @@ -0,0 +1,352 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static double c_b8 = -1.; +static double c_b9 = 1.; + +/** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. + * + * This routine is a minor modification of LAPACK's dlasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int * + nb, int *kb, double *a, int *lda, int *ipiv, + double *w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + double d__1, d__2, d__3; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k; + static double t, r1, d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static double alpha; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int dgemv_(char *, int *, int *, + double *, double *, int *, double *, int *, + double *, double *, int *, ftnlen), dcopy_(int *, + double *, int *, double *, int *), dswap_(int + *, double *, int *, double *, int *); + static int kstep; + static double absakk; + extern int idamax_(int *, double *, int *); + static double colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.) + 1.) / 8.; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + kstep = 1; + absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); + if (imax > 1) { + i__1 = imax - 1; + jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1], + abs(d__1)); + rowmax = max(d__2,d__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >= + alpha * rowmax) { + kp = imax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kk - 1 - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + r1 = 1. / a[k + k * a_dim1]; + i__1 = k - 1; + dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + d21 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d21; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; + t = 1. / (d11 * d22 - 1.); + d21 = t / d21; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) + * w_dim1] - w[j + kw * w_dim1]); + a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1], + abs(d__1)); + rowmax = max(d__2,d__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >= + alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k + kstep - 1; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kp - kk - 1; + dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + r1 = 1. / a[k + k * a_dim1]; + i__1 = *n - k; + dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1. / (d11 * d22 - 1.); + d21 = t / d21; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - + w[j + (k + 1) * w_dim1]); + a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c new file mode 100644 index 000000000..19a875c7a --- /dev/null +++ b/relapack/src/dsytrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include +#endif + +static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); + + +/** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's dsytrf_rook. 
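+ *
+ * The calling sequence is identical to RELAPACK_dsytrf (see dsytrf.c),
+ * including the lWork == -1 workspace query; only the pivoting strategy
+ * differs: rook pivoting performs additional row/column searches to bound
+ * the magnitude of the entries of the computed factor.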
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html + * */ +void RELAPACK_dsytrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** dsytrf_rook's recursive compute kernel */ +static void RELAPACK_dsytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = DREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + double *const A_BL_B = A + *n; + double *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + n1; + double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = DREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + *ldA * n_rest; + double *const A_TR_T = A + *ldA * (n_rest + n1); + double *const A_TL = A + *ldA * n_rest + n_rest; + double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/dsytrf_rook_rec2.c b/relapack/src/dsytrf_rook_rec2.c new file mode 100644 index 000000000..105ef5ed3 --- /dev/null +++ b/relapack/src/dsytrf_rook_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static double c_b9 = -1.; +static double c_b10 = 1.; + +/** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. + * + * This routine is a minor modification of LAPACK's dlasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, double *a, int *lda, int *ipiv, + double *w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + double d__1; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k, p; + static double t, r1, d11, d12, d21, d22; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static logical done; + static int imax, jmax; + static double alpha; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int dgemv_(char *, int *, int *, + double *, double *, int *, double *, int *, + double *, double *, int *, ftnlen); + static double dtemp, sfmin; + static int itemp; + extern /* Subroutine */ int dcopy_(int *, double *, int *, + double *, int *), dswap_(int *, double *, int + *, double *, int *); + static int kstep; + extern double dlamch_(char *, ftnlen); + static double absakk; + extern int idamax_(int *, double *, int *); + static double colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.) + 1.) / 8.; + sfmin = dlamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) < + alpha * rowmax)) { + kp = imax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = k - 1 - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) { + r1 = 1. / a[k + k * a_dim1]; + i__1 = k - 1; + dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else if (a[k + k * a_dim1] != 0.) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L14: */ + } + } + } + } else { + if (k > 2) { + d12 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d12; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; + t = 1. 
/ (d11 * d22 - 1.); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * + w_dim1] - w[j + kw * w_dim1]) / d12); + a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]) / d12); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha + * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! 
done) {
+                goto L72;
+            }
+        }
+        kk = k + kstep - 1;
+        if (kstep == 2 && p != k) {
+            i__1 = p - k;
+            dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
+                    lda);
+            i__1 = *n - p + 1;
+            dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
+                    c__1);
+            dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
+            dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
+        }
+        if (kp != kk) {
+            a[kp + k * a_dim1] = a[kk + k * a_dim1];
+            i__1 = kp - k - 1;
+            dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
+                    * a_dim1], lda);
+            i__1 = *n - kp + 1;
+            dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
+                    a_dim1], &c__1);
+            dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+            dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+        }
+        if (kstep == 1) {
+            i__1 = *n - k + 1;
+            dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+                    c__1);
+            if (k < *n) {
+                if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
+                    r1 = 1. / a[k + k * a_dim1];
+                    i__1 = *n - k;
+                    dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+                } else if (a[k + k * a_dim1] != 0.) {
+                    i__1 = *n;
+                    for (ii = k + 1; ii <= i__1; ++ii) {
+                        a[ii + k * a_dim1] /= a[k + k * a_dim1];
+/* L74: */
+                    }
+                }
+            }
+        } else {
+            if (k < *n - 1) {
+                d21 = w[k + 1 + k * w_dim1];
+                d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
+                d22 = w[k + k * w_dim1] / d21;
+                t = 1. / (d11 * d22 - 1.);
+                i__1 = *n;
+                for (j = k + 2; j <= i__1; ++j) {
+                    a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
+                            j + (k + 1) * w_dim1]) / d21);
+                    a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
+                            w_dim1] - w[j + k * w_dim1]) / d21);
+/* L80: */
+                }
+            }
+            a[k + k * a_dim1] = w[k + k * w_dim1];
+            a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
+            a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
+        }
+    }
+    if (kstep == 1) {
+        ipiv[k] = kp;
+    } else {
+        ipiv[k] = -p;
+        ipiv[k + 1] = -kp;
+    }
+    k += kstep;
+    goto L70;
+L90:
+    j = k - 1;
+L120:
+    kstep = 1;
+    jp1 = 1;
+    jj = j;
+    jp2 = ipiv[j];
+    if (jp2 < 0) {
+        jp2 = -jp2;
+        --j;
+        jp1 = -ipiv[j];
+        kstep = 2;
+    }
+    --j;
+    if (jp2 != jj && j >= 1) {
+        dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    jj = j + 1;
+    if (jp1 != jj && kstep == 2) {
+        dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    if (j >= 1) {
+        goto L120;
+    }
+    *kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/dtgsyl.c b/relapack/src/dtgsyl.c
new file mode 100644
index 000000000..c506926af
--- /dev/null
+++ b/relapack/src/dtgsyl.c
@@ -0,0 +1,274 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
+    const int *, const double *, const int *, const double *, const int *,
+    double *, const int *, const double *, const int *, const double *,
+    const int *, double *, const int *, double *, double *, double *, int *,
+    int *, int *);
+
+
+/** DTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's dtgsyl.
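+ * For trans = 'N' it computes matrices R and L, returned in C and F,
+ * with
+ *     A * R - L * B = scale * C
+ *     D * R - L * E = scale * F,
+ * where the pairs (A, D) and (B, E) are in generalized real Schur
+ * form; for trans = 'T' it solves the corresponding transposed system.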
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html
+ * */
+void RELAPACK_dtgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dif,
+    double *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "T");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DTGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'T';
+
+    // Constant
+    const double ZERO[] = { 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    double scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        double dscale = 0;
+        double dsum = 1;
+        int pq;
+        RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(pq) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(dlacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m);
+                LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(dlacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** dtgsyl's recursive compute kernel */
+static void RELAPACK_dtgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dsum, double *dscale,
+    int *iWork, int *pq, int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) {
+        // Unblocked
+        LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1. };
+    double scale2[] = { 1. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        int m1 = DREC_SPLIT(*m);
+        if (A[m1 + *ldA * (m1 - 1)])
+            m1++;
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + *ldA * m1;
+        const double *const A_BR = A + *ldA * m1 + m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + m1;
+
+        // D_TL D_TR
+        // 0    D_BR
+        const double *const D_TL = D;
+        const double *const D_TR = D + *ldD * m1;
+        const double *const D_BR = D + *ldD * m1 + m1;
+
+        // F_T
+        // F_B
+        double *const F_T = F;
+        double *const F_B = F + m1;
+
+        if (*trans == 'N') {
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // F_T = F_T - D_TR * C_B
+            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
+            }
+        } else {
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
+            // C_B = C_B - A_TR^H * C_T
+            BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // C_B = C_B - D_TR^H * F_T
+            BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldF, ONE, C_B, ldC);
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
+            }
+        }
+    } else {
+        // Splitting
+        int n1 = DREC_SPLIT(*n);
+        if (B[n1 + *ldB * (n1 - 1)])
+            n1++;
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + *ldB * n1;
+        const double *const B_BR = B + *ldB * n1 + n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + *ldC * n1;
+
+        // E_TL E_TR
+        // 0    E_BR
+        const double *const E_TL = E;
+        const double *const E_TR = E + *ldE * n1;
+        const double *const E_BR = E + *ldE * n1 + n1;
+
+        // F_L F_R
+        double *const F_L = F;
+        double *const F_R = F + *ldF * n1;
+
+        if (*trans == 'N') {
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // C_R = C_R + F_L * B_TR
+            BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
+            // F_R = F_R + F_L * E_TR
+            BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
+            }
+        } else {
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
+            // F_L = F_L + C_R * B_TR
+            BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
+            // F_L = F_L + F_R * E_TR
+            BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF);
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
+            }
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c
new file mode 100644
index 000000000..c87b53ae5
--- /dev/null
+++ b/relapack/src/dtrsyl.c
@@ -0,0 +1,169 @@
+#include "relapack.h"
+
+static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *,
+    const int *, const int *, const double *, const int *, const double *,
+    const int *, double *, const int *, double *, int *);
+
+
+/** DTRSYL solves the real Sylvester matrix equation.
+ *
+ * This routine is functionally equivalent to LAPACK's dtrsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html
+ * */
+void RELAPACK_dtrsyl(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC, double *scale,
+    int *info
+) {
+
+    // Check arguments
+    const int notransA = LAPACK(lsame)(tranA, "N");
+    const int transA = LAPACK(lsame)(tranA, "T");
+    const int ctransA = LAPACK(lsame)(tranA, "C");
+    const int notransB = LAPACK(lsame)(tranB, "N");
+    const int transB = LAPACK(lsame)(tranB, "T");
+    const int ctransB = LAPACK(lsame)(tranB, "C");
+    *info = 0;
+    if (!transA && !ctransA && !notransA)
+        *info = -1;
+    else if (!transB && !ctransB && !notransB)
+        *info = -2;
+    else if (*isgn != 1 && *isgn != -1)
+        *info = -3;
+    else if (*m < 0)
+        *info = -4;
+    else if (*n < 0)
+        *info = -5;
+    else if (*ldA < MAX(1, *m))
+        *info = -7;
+    else if (*ldB < MAX(1, *n))
+        *info = -9;
+    else if (*ldC < MAX(1, *m))
+        *info = -11;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DTRSYL", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
+    const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
+
+    // Recursive kernel
+    RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+
+
+/** dtrsyl's recursive compute kernel */
+static void RELAPACK_dtrsyl_rec(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC, double *scale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) {
+        // Unblocked
+        RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const double MSGN[] = { -*isgn };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1. };
+    double scale2[] = { 1. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        int m1 = DREC_SPLIT(*m);
+        if (A[m1 + *ldA * (m1 - 1)])
+            m1++;
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + *ldA * m1;
+        const double *const A_BR = A + *ldA * m1 + m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        int n1 = DREC_SPLIT(*n);
+        if (B[n1 + *ldB * (n1 - 1)])
+            n1++;
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + *ldB * n1;
+        const double *const B_BR = B + *ldB * n1 + n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
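+    // Combine the scaling factors of the two recursive solves; as in
+    // LAPACK's dtrsyl, info = 1 only flags that perturbed values were
+    // used because A and B have common or very close eigenvalues.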
+ *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/dtrsyl_rec2.c b/relapack/src/dtrsyl_rec2.c new file mode 100644 index 000000000..479c7f340 --- /dev/null +++ b/relapack/src/dtrsyl_rec2.c @@ -0,0 +1,1034 @@ +/* -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static int c_false = FALSE_; +static int c__2 = 2; +static double c_b26 = 1.; +static double c_b30 = 0.; +static int c_true = TRUE_; + +int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, int *isgn, int + *m, int *n, double *a, int *lda, double *b, int * + ldb, double *c__, int *ldc, double *scale, int *info, + ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + double d__1, d__2; + + /* Local variables */ + static int j, k, l; + static double x[4] /* was [2][2] */; + static int k1, k2, l1, l2; + static double a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, + sgn; + extern double ddot_(int *, double *, int *, double *, + int *); + static int ierr; + static double smin, suml, sumr; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern int lsame_(char *, char *, ftnlen, ftnlen); + static int knext, lnext; + static double xnorm; + extern /* Subroutine */ int dlaln2_(int *, int *, int *, + double *, double *, double *, int *, double *, + double *, double *, int *, double *, double * + , double *, int *, double *, double *, int *), + dlasy2_(int *, int *, int *, int *, int *, + double *, int *, double *, int *, double *, + int *, double *, double *, int *, double *, + int *), dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen), dlange_(char *, int *, + int *, double *, int *, double *, ftnlen); + static double scaloc; + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + static double bignum; + static int notrna, notrnb; + static double smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "T", (ftnlen)1, (ftnlen)1) && ! lsame_( + trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T", (ftnlen)1, (ftnlen)1) && ! + lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DTRSYL", &i__1, (ftnlen)6); + return 0; + } + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + eps = dlamch_("P", (ftnlen)1); + smlnum = dlamch_("S", (ftnlen)1); + bignum = 1. 
/ smlnum; + dlabad_(&smlnum, &bignum); + smlnum = smlnum * (double) (*m * *n) / eps; + bignum = 1. / smlnum; +/* Computing MAX */ + d__1 = smlnum, d__2 = eps * dlange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * dlange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = max(d__1,d__2); + sgn = (double) (*isgn); + if (notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L60; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L50; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L20: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L30: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_false, &c_false, isgn, &c__2, &c__2, &a[k1 + + k1 * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, + &c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L50: + ; + } +L60: + ; + } + } else if (! 
notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L120; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = 1; + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + if (k < knext) { + goto L110; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L80: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L90: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_true, &c_false, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L110: + ; + } +L120: + ; + } + } else if (! notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L180; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = 1; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + if (k < knext) { + goto L170; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l1; +/* Computing MIN */ + i__3 = l1 + 1; +/* Computing MIN */ + i__4 = l1 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L130: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L140: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L150: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_true, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 * + a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L160: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L170: + ; + } +L180: + ; + } + } else if (notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L240; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L230; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l1; +/* Computing MIN */ + i__2 = l1 + 1; +/* Computing MIN */ + i__3 = l1 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) 
{ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L190: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L200: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L210: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_false, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L220: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L230: + ; + } +L240: + ; + } + } + return 0; +} diff --git a/relapack/src/dtrtri.c b/relapack/src/dtrtri.c new file mode 100644 index 000000000..0462609e9 --- /dev/null +++ b/relapack/src/dtrtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, + double *, const int *, int *); + + +/** DTRTRI computes the inverse of a real upper or lower triangular matrix A. + * + * This routine is functionally equivalent to LAPACK's dtrtri. 
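+ * The recursive kernel uses the block identity (shown for the lower
+ * triangular case; the upper triangular case is analogous)
+ *
+ *     inv( [ A_TL    0  ] ) = [ inv(A_TL)                   0         ]
+ *          [ A_BL  A_BR ]     [ -inv(A_BR)*A_BL*inv(A_TL)   inv(A_BR) ]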
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html
+ * */
+void RELAPACK_dtrtri(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DTRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ? 'N' : 'U';
+
+    // check for singularity
+    if (nounit) {
+        int i;
+        for (i = 0; i < *n; i++)
+            if (A[i + *ldA * i] == 0) {
+                *info = i + 1;
+                return;
+            }
+    }
+
+    // Recursive kernel
+    RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
+}
+
+
+/** dtrtri's recursive compute kernel */
+static void RELAPACK_dtrtri_rec(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_DTRTRI, 1)) {
+        // Unblocked
+        LAPACK(dtrti2)(uplo, diag, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL)
+    RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = - A_BL * A_TL
+        BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
+        // A_BL = A_BR \ A_BL
+        BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TR = - A_TL * A_TR
+        BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
+        // A_TR = A_TR / A_BR
+        BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/f2c.c b/relapack/src/f2c.c
new file mode 100644
index 000000000..5a3452419
--- /dev/null
+++ b/relapack/src/f2c.c
@@ -0,0 +1,109 @@
+#include "stdlib.h"
+#include "stdio.h"
+#include "signal.h"
+#include "f2c.h"
+
+#ifndef SIGIOT
+#ifdef SIGABRT
+#define SIGIOT SIGABRT
+#endif
+#endif
+
+void sig_die(const char *s, int kill) {
+    /* print error message, then clear buffers */
+    fprintf(stderr, "%s\n", s);
+
+    if(kill) {
+        fflush(stderr);
+        /* now get a core */
+        signal(SIGIOT, SIG_DFL);
+        abort();
+    } else
+        exit(1);
+}
+
+void c_div(complex *c, complex *a, complex *b) {
+    double ratio, den;
+    double abr, abi, cr;
+
+    if( (abr = b->r) < 0.)
+        abr = - abr;
+    if( (abi = b->i) < 0.)
+ abi = - abi; + if( abr <= abi ) { + if(abi == 0) { +#ifdef IEEE_COMPLEX_DIVIDE + float af, bf; + af = bf = abr; + if (a->i != 0 || a->r != 0) + af = 1.; + c->i = c->r = af / bf; + return; +#else + sig_die("complex division by zero", 1); +#endif + } + ratio = (double)b->r / b->i ; + den = b->i * (1 + ratio*ratio); + cr = (a->r*ratio + a->i) / den; + c->i = (a->i*ratio - a->r) / den; + } else { + ratio = (double)b->i / b->r ; + den = b->r * (1 + ratio*ratio); + cr = (a->r + a->i*ratio) / den; + c->i = (a->i - a->r*ratio) / den; + } + c->r = cr; +} + +void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) { + double ratio, den; + double abr, abi, cr; + + if( (abr = b->r) < 0.) + abr = - abr; + if( (abi = b->i) < 0.) + abi = - abi; + if( abr <= abi ) { + if(abi == 0) { +#ifdef IEEE_COMPLEX_DIVIDE + if (a->i != 0 || a->r != 0) + abi = 1.; + c->i = c->r = abi / abr; + return; +#else + sig_die("complex division by zero", 1); +#endif + } + ratio = b->r / b->i ; + den = b->i * (1 + ratio*ratio); + cr = (a->r*ratio + a->i) / den; + c->i = (a->i*ratio - a->r) / den; + } else { + ratio = b->i / b->r ; + den = b->r * (1 + ratio*ratio); + cr = (a->r + a->i*ratio) / den; + c->i = (a->i - a->r*ratio) / den; + } + c->r = cr; +} + +float r_imag(complex *z) { + return z->i; +} + +void r_cnjg(complex *r, complex *z) { + float zi = z->i; + r->r = z->r; + r->i = -zi; +} + +double d_imag(doublecomplex *z) { + return z->i; +} + +void d_cnjg(doublecomplex *r, doublecomplex *z) { + double zi = z->i; + r->r = z->r; + r->i = -zi; +} diff --git a/relapack/src/f2c.h b/relapack/src/f2c.h new file mode 100644 index 000000000..b94ee7c8e --- /dev/null +++ b/relapack/src/f2c.h @@ -0,0 +1,223 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." + + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +typedef long int integer; +typedef unsigned long int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +typedef long int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; +#ifdef INTEGER_STAR_8 /* Adjust for integer*8. 
*/ +typedef long long longint; /* system-dependent */ +typedef unsigned long long ulongint; /* system-dependent */ +#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) +#define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) +#endif + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +#ifdef f2c_i2 +/* for -i2 */ +typedef short flag; +typedef short ftnlen; +typedef short ftnint; +#else +typedef long int flag; +typedef long int ftnlen; +typedef long int ftnint; +#endif + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +/*typedef long int Long;*/ /* No longer used; formerly in Namelist */ + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (doublereal)abs(x) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (doublereal)min(a,b) +#define dmax(a,b) (doublereal)max(a,b) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef int /* Unknown procedure type */ (*U_fp)(...); +typedef shortint (*J_fp)(...); +typedef integer (*I_fp)(...); +typedef real (*R_fp)(...); +typedef doublereal (*D_fp)(...), (*E_fp)(...); +typedef /* Complex */ VOID (*C_fp)(...); +typedef /* Double Complex */ VOID (*Z_fp)(...); +typedef logical (*L_fp)(...); +typedef shortlogical (*K_fp)(...); +typedef /* Character */ VOID (*H_fp)(...); +typedef /* Subroutine */ int (*S_fp)(...); +#else +typedef int /* Unknown procedure type */ (*U_fp)(); +typedef shortint (*J_fp)(); +typedef integer (*I_fp)(); +typedef real (*R_fp)(); +typedef doublereal (*D_fp)(), (*E_fp)(); +typedef /* Complex */ VOID (*C_fp)(); +typedef /* Double Complex */ VOID (*Z_fp)(); +typedef logical (*L_fp)(); +typedef shortlogical (*K_fp)(); +typedef /* Character */ VOID (*H_fp)(); +typedef /* Subroutine */ int (*S_fp)(); +#endif +/* E_fp is for real functions when -R is not specified */ +typedef VOID C_f; /* complex function */ +typedef VOID H_f; /* character function */ +typedef VOID Z_f; /* double complex function */ +typedef doublereal E_f; /* real function with -R not specified */ + +/* undef any lower-case symbols that your C compiler predefines, e.g.: */ + +#ifndef Skip_f2c_Undefs +#undef cray +#undef gcos +#undef mc68010 +#undef mc68020 +#undef mips +#undef pdp11 +#undef sgi +#undef sparc +#undef sun +#undef sun2 +#undef sun3 +#undef sun4 +#undef u370 +#undef u3b +#undef u3b2 +#undef u3b5 +#undef unix +#undef vax +#endif +#endif diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h new file mode 100644 index 000000000..064276b7e --- /dev/null +++ b/relapack/src/lapack.h @@ -0,0 +1,80 @@ +#ifndef LAPACK_H +#define LAPACK_H + +extern int LAPACK(lsame)(const char *, const char *); +extern int LAPACK(xerbla)(const char *, const int *); + +extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); + +extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); +extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); +extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); +extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); + +extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); +extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); +extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); +extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, 
double *, const int *); + +extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); + +extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *); + +extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); + +extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *); +extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *); +extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *); +extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *); + +extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *); + +extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *); + +extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(chetf2_rook)(const char *, const int *, 
float *, const int *, int *, int *); +extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *); +extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *); +extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *); +extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *); + +#endif /* LAPACK_H */ diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c new file mode 100644 index 000000000..488547260 --- /dev/null +++ b/relapack/src/lapack_wrappers.c @@ -0,0 +1,607 @@ +#include "relapack.h" + +//////////// +// XLAUUM // +//////////// + +#if INCLUDE_SLAUUM +void LAPACK(slauum)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_slauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_DLAUUM +void LAPACK(dlauum)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dlauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_CLAUUM +void LAPACK(clauum)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_clauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZLAUUM +void LAPACK(zlauum)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_zlauum(uplo, n, A, ldA, info); +} +#endif + + +//////////// +// XSYGST // +//////////// + +#if INCLUDE_SSYGST +void LAPACK(ssygst)( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_DSYGST +void LAPACK(dsygst)( + const int *itype, 
const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_CHEGST +void LAPACK(chegst)( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_ZHEGST +void LAPACK(zhegst)( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + + +//////////// +// XTRTRI // +//////////// + +#if INCLUDE_STRTRI +void LAPACK(strtri)( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_strtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_DTRTRI +void LAPACK(dtrtri)( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_CTRTRI +void LAPACK(ctrtri)( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZTRTRI +void LAPACK(ztrtri)( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); +} +#endif + + +//////////// +// XPOTRF // +//////////// + +#if INCLUDE_SPOTRF +void LAPACK(spotrf)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_spotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_DPOTRF +void LAPACK(dpotrf)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dpotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_CPOTRF +void LAPACK(cpotrf)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_cpotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZPOTRF +void LAPACK(zpotrf)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_zpotrf(uplo, n, A, ldA, info); +} +#endif + + +//////////// +// XPBTRF // +//////////// + +#if INCLUDE_SPBTRF +void LAPACK(spbtrf)( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_DPBTRF +void LAPACK(dpbtrf)( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_CPBTRF +void LAPACK(cpbtrf)( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_ZPBTRF +void LAPACK(zpbtrf)( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + + +//////////// +// XSYTRF // +//////////// + +#if INCLUDE_SSYTRF +void LAPACK(ssytrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_DSYTRF +void LAPACK(dsytrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) 
{ + RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CSYTRF +void LAPACK(csytrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZSYTRF +void LAPACK(zsytrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CHETRF +void LAPACK(chetrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZHETRF +void LAPACK(zhetrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_SSYTRF_ROOK +void LAPACK(ssytrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_DSYTRF_ROOK +void LAPACK(dsytrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CSYTRF_ROOK +void LAPACK(csytrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZSYTRF_ROOK +void LAPACK(zsytrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CHETRF_ROOK +void LAPACK(chetrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZHETRF_ROOK +void LAPACK(zhetrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + + +//////////// +// XGETRF // +//////////// + +#if INCLUDE_SGETRF +void LAPACK(sgetrf)( + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_DGETRF +void LAPACK(dgetrf)( + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_CGETRF +void LAPACK(cgetrf)( + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_ZGETRF +void LAPACK(zgetrf)( + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); +} +#endif + + +//////////// +// XGBTRF // +//////////// + +#if INCLUDE_SGBTRF +void LAPACK(sgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + 
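+    // thin wrapper: pass every argument through unchanged to the recursive implementation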
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_DGBTRF +void LAPACK(dgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_CGBTRF +void LAPACK(cgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_ZGBTRF +void LAPACK(zgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + + +//////////// +// XTRSYL // +//////////// + +#if INCLUDE_STRSYL +void LAPACK(strsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_DTRSYL +void LAPACK(dtrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_CTRSYL +void LAPACK(ctrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_ZTRSYL +void LAPACK(ztrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + + +//////////// +// XTGSYL // +//////////// + +#if INCLUDE_STGSYL +void LAPACK(stgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, + float *scale, float *dif, + float *Work, const int *lWork, int *iWork, int *info +) { + RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); +} +#endif + +#if INCLUDE_DTGSYL +void LAPACK(dtgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, + double *scale, double *dif, + double *Work, const int *lWork, int *iWork, int *info +) { + RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); +} +#endif + +#if INCLUDE_CTGSYL +void LAPACK(ctgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, 
const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dif,
+    float *Work, const int *lWork, int *iWork, int *info
+) {
+    RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+#if INCLUDE_ZTGSYL
+void LAPACK(ztgsyl)(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dif,
+    double *Work, const int *lWork, int *iWork, int *info
+) {
+    RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+
+////////////
+// XGEMMT //
+////////////
+
+#if INCLUDE_SGEMMT
+void LAPACK(sgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+    RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_DGEMMT
+void LAPACK(dgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+    RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_CGEMMT
+void LAPACK(cgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+    RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_ZGEMMT
+void LAPACK(zgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+    RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
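These wrappers are what make the recursive routines drop-in replacements: each one forwards its arguments to the corresponding RELAPACK_* implementation under the standard LAPACK symbol name, so existing callers pick them up at link time without source changes. As a minimal usage sketch (not part of this patch; it assumes the library was built with LAPACK_UNDERSCORE=1, so the wrapped symbols carry a trailing underscore), a C caller reaches RELAPACK_dpotrf through the ordinary dpotrf_ entry point:

#include <stdio.h>

/* Fortran-style prototype; with LAPACK_UNDERSCORE=1 the dpotrf wrapper
 * above is emitted as dpotrf_ and forwards to RELAPACK_dpotrf. */
extern void dpotrf_(const char *uplo, const int *n, double *A,
                    const int *ldA, int *info);

int main(void) {
    /* 2x2 symmetric positive definite matrix, column-major */
    double A[4] = { 4.0, 2.0, 2.0, 3.0 };
    const int n = 2, ldA = 2;
    int info;

    dpotrf_("L", &n, A, &ldA, &info); /* A = L * L**T, L in the lower triangle */

    if (info == 0)
        printf("L = [ %g 0 ; %g %g ]\n", A[0], A[1], A[3]); /* 2 1 sqrt(2) */
    else
        printf("dpotrf: info = %d\n", info);
    return 0;
}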
diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h
new file mode 100644
index 000000000..2cb061c32
--- /dev/null
+++ b/relapack/src/relapack.h
@@ -0,0 +1,60 @@
+#ifndef RELAPACK_INT_H
+#define RELAPACK_INT_H
+
+#include "../config.h"
+
+#include "../inc/relapack.h"
+
+// add an underscore to BLAS routines (or not)
+#if BLAS_UNDERSCORE
+#define BLAS(routine) routine ## _
+#else
+#define BLAS(routine) routine
+#endif
+
+// add an underscore to LAPACK routines (or not)
+#if LAPACK_UNDERSCORE
+#define LAPACK(routine) routine ## _
+#else
+#define LAPACK(routine) routine
+#endif
+
+// minimum and maximum macros
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+// XREC_SPLIT(n) returns how a problem of size n is split recursively.
+// Above the threshold, the split makes the size of at least one half a
+// multiple of a full 64-byte cache line of elements (16 floats; 8 doubles
+// or complex floats; 4 double complex) while keeping both halves as close
+// in size as possible; e.g. DREC_SPLIT(100) = ((100 + 8) / 16) * 8 = 48,
+// leaving halves of 48 and 52.
+// Below the threshold, the problem is simply split in the middle. (Note
+// that the crossover size is usually larger than the threshold.)
+#define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2)
+#define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
+#define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
+#define ZREC_SPLIT(n) ((n >= 8) ?
((n + 4) / 8) * 4 : n / 2) + +#include "lapack.h" +#include "blas.h" + +// sytrf helper routines +void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); + +// trsyl helper routines +void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); + +#endif /* RELAPACK_INT_H */ diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c new file mode 100644 index 000000000..bc20e744b --- /dev/null +++ b/relapack/src/sgbtrf.c @@ -0,0 +1,227 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, + const int *, float *, const int *, int *, float *, const int *, float *, + const int *, int *); + + +/** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's sgbtrf. 
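 * The matrix is expected in LAPACK band storage: Ab has ldAb >= 2*kl + ku + 1
 * rows, entry A(i,j) is held at Ab(kl + ku + 1 + i - j, j) (1-based), and the
 * first kl rows are scratch space for the fill-in produced by row pivoting.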
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html
+ * */
+void RELAPACK_sgbtrf(
+    const int *m, const int *n, const int *kl, const int *ku,
+    float *Ab, const int *ldAb, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kl < 0)
+        *info = -3;
+    else if (*ku < 0)
+        *info = -4;
+    else if (*ldAb < 2 * *kl + *ku + 1)
+        *info = -6;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SGBTRF", &minfo);
+        return;
+    }
+
+    // Constant
+    const float ZERO[] = { 0. };
+
+    // Resulting upper band width
+    const int kv = *ku + *kl;
+
+    // Unskew A
+    const int ldA[] = { *ldAb - 1 };
+    float *const A = Ab + kv;
+
+    // Zero upper diagonal fill-in elements
+    int i, j;
+    for (j = 0; j < *n; j++) {
+        float *const A_j = A + *ldA * j;
+        for (i = MAX(0, j - kv); i < j - *ku; i++)
+            A_j[i] = 0.;
+    }
+
+    // Allocate work space
+    const int n1 = SREC_SPLIT(*n);
+    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
+    const int nWorkl = (kv > n1) ? n1 : kv;
+    const int mWorku = (*kl > n1) ? n1 : *kl;
+    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
+    float *Worku = malloc(mWorku * nWorku * sizeof(float));
+    LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
+    LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
+
+    // Recursive kernel
+    RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
+
+    // Free work space
+    free(Workl);
+    free(Worku);
+}
+
+
+/** sgbtrf's recursive compute kernel */
+static void RELAPACK_sgbtrf_rec(
+    const int *m, const int *n, const int *kl, const int *ku,
+    float *Ab, const int *ldAb, int *ipiv,
+    float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
+        // Unblocked
+        LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1.
}; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + kv; + + // Splitting + const int n1 = MIN(SREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + float *const Ab_L = Ab; + float *const Ab_BR = Ab + *ldAb * n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + m1; + float *const A_BR = A + *ldA * n1 + m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + float *const A_Rl = A_R; + float *const A_Rr = A_R + *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + m21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + *ldA * n21; + float *const A_BRbl = A_BR + m21; + float *const A_BRbr = A_BR + *ldA * n21 + m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // apply pivots to A_Rl + LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); + + // apply pivots to A_Rr columnwise + for (j = 0; j < n22; j++) { + float *const A_Rrj = A_Rr + *ldA * j; + for (i = j; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + const float tmp = A_Rrj[i]; + A_Rrj[i] = A_Rr[ip]; + A_Rrj[ip] = tmp; + } + } + } + + // A_TRl = A_TL \ A_TRl + BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // Worku = A_TRr + LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); + // Worku = A_TL \ Worku + BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + // A_TRr = Worku + LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); + // A_BRtl = A_BRtl - A_BLt * A_TRl + BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); + // A_BRbl = A_BRbl - Workl * A_TRl + BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); + // A_BRtr = A_BRtr - A_BLt * Worku + BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Workl * Worku + BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); + + // partially undo swaps in A_L + for (i = mn1 - 1; i >= 0; i--) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // recursion(Ab_BR, ipiv_B) + RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, 
ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/sgemmt.c b/relapack/src/sgemmt.c new file mode 100644 index 000000000..75f78fabd --- /dev/null +++ b/relapack/src/sgemmt.c @@ -0,0 +1,165 @@ +#include "relapack.h" + +static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); + +static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); + + +/** SGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_sgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !notransA) + info = 2; + else if (!tranB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("SGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : 'T'; + const char cleantransB = notransB ? 'N' : 'T'; + + // Recursive kernel + RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** sgemmt's recursive compute kernel */ +static void RELAPACK_sgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { + // Unblocked + RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const float *const A_T = A; + const float *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const float *const B_L = B; + const float *const B_R = B + ((*transB == 'N') ? 
*ldB * n1 : n1);
+
+    // C_TL C_TR
+    // C_BL C_BR
+    float *const C_TL = C;
+    float *const C_TR = C + *ldC * n1;
+    float *const C_BL = C + n1;
+    float *const C_BR = C + *ldC * n1 + n1;
+
+    // recursion(C_TL)
+    RELAPACK_sgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
+
+    if (*uplo == 'L')
+        // C_BL = alpha A_B B_L + beta C_BL
+        BLAS(sgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
+    else
+        // C_TR = alpha A_T B_R + beta C_TR
+        BLAS(sgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
+
+    // recursion(C_BR)
+    RELAPACK_sgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
+}
+
+
+/** sgemmt's unblocked compute kernel */
+static void RELAPACK_sgemmt_rec2(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+
+    const int incB = (*transB == 'N') ? 1 : *ldB;
+    const int incC = 1;
+
+    int i;
+    for (i = 0; i < *n; i++) {
+        // A_0
+        // A_i
+        const float *const A_0 = A;
+        const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
+
+        // * B_i *
+        const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
+
+        // * C_0i *
+        // * C_ii *
+        float *const C_0i = C + *ldC * i;
+        float *const C_ii = C + *ldC * i + i;
+
+        if (*uplo == 'L') {
+            const int nmi = *n - i;
+            if (*transA == 'N')
+                BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+            else
+                BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+        } else {
+            const int ip1 = i + 1;
+            if (*transA == 'N')
+                BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+            else
+                BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+        }
+    }
+}
diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c
new file mode 100644
index 000000000..284f8cff6
--- /dev/null
+++ b/relapack/src/sgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *,
+    int *, int *);
+
+
+/** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's sgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html
+ * */
+void RELAPACK_sgetrf(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const float ONE[] = { 1. };
+        const int iONE[] = { 1
}; + + // Splitting + const int rn = *n - *m; + + // A_L A_R + const float *const A_L = A; + float *const A_R = A + *ldA * *m; + + // A_R = apply(ipiv, A_R) + LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); + // A_R = A_L \ A_R + BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); + } +} + + +/** sgetrf's recursive compute kernel */ +static void RELAPACK_sgetrf_rec( + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info +) { + + if (*n <= MAX(CROSSOVER_SGETRF, 1)) { + // Unblocked + LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // recursion(A_L, ipiv_T) + RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + // apply pivots to A_R + LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); + + // A_TR = A_TL \ A_TR + BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_BL * A_TR + BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); + + // recursion(A_BR, ipiv_B) + RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); + if (*info) + *info += n1; + // apply pivots to A_BL + LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); + // shift pivots + int i; + for (i = 0; i < n2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/slauum.c b/relapack/src/slauum.c new file mode 100644 index 000000000..280f141b3 --- /dev/null +++ b/relapack/src/slauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_slauum_rec(const char *, const int *, float *, + const int *, int *); + + +/** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's slauum. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html + * */ +void RELAPACK_slauum( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** slauum's recursive compute kernel */ +static void RELAPACK_slauum_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_SLAUUM, 1)) { + // Unblocked + LAPACK(slauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1. 
}; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_slauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(ssyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(ssyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_slauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c new file mode 100644 index 000000000..ee0a5546e --- /dev/null +++ b/relapack/src/spbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, + float *, const int *, float *, const int *, int *); + + +/** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's spbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html + * */ +void RELAPACK_spbtrf( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const float ZERO[] = { 0. }; + + // Allocate work space + const int n1 = SREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + float *Work = malloc(mWork * nWork * sizeof(float)); + LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** spbtrf's recursive compute kernel */ +static void RELAPACK_spbtrf_rec( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + float *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { + // Unblocked + LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + ((*uplo == 'L') ? 
0 : *kd); + + // Splitting + const int n1 = MIN(SREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + float *const Ab_BR = Ab + *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + n21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + *ldA * n21; + float *const A_BRbl = A_BR + n21; + float *const A_BRbr = A_BR + *ldA * n21 + n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/spotrf.c b/relapack/src/spotrf.c new file mode 100644 index 000000000..2a609321b --- /dev/null +++ b/relapack/src/spotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static void RELAPACK_spotrf_rec(const char *, const int *, float *, + const int *, int *); + + +/** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's spotrf. 
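 * The recursion splits A into quadrants, factors the top-left block, solves
 * a triangular system for the off-diagonal block (strsm), downdates the
 * trailing block (ssyrk), and recurses on it; blocks at or below the
 * crossover size are handled by the unblocked spotf2.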
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html + * */ +void RELAPACK_spotrf( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SPOTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info); +} + + +/** spotrf's recursive compute kernel */ +static void RELAPACK_spotrf_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { + // Unblocked + LAPACK(spotf2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = A_BL / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); + // A_BR = A_BR - A_BL * A_BL' + BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); + } else { + // A_TR = A_TL' \ A_TR + BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_TR' * A_TR + BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); + } + + // recursion(A_BR) + RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c new file mode 100644 index 000000000..7f145cdec --- /dev/null +++ b/relapack/src/ssygst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_ssygst_rec(const int *, const char *, const int *, + float *, const int *, const float *, const int *, + float *, const int *, int *); + + +/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's ssygst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html + * */ +void RELAPACK_ssygst( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SSYGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 
diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c
new file mode 100644
index 000000000..7f145cdec
--- /dev/null
+++ b/relapack/src/ssygst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_ssygst_rec(const int *, const char *, const int *,
+    float *, const int *, const float *, const int *,
+    float *, const int *, int *);
+
+
+/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's ssygst.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html
+ * */
+void RELAPACK_ssygst(
+    const int *itype, const char *uplo, const int *n,
+    float *A, const int *ldA, const float *B, const int *ldB,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (*itype < 1 || *itype > 3)
+        *info = -1;
+    else if (!lower && !upper)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    else if (*ldB < MAX(1, *n))
+        *info = -7;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SSYGST", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Allocate work space
+    float *Work = NULL;
+    int lWork = 0;
+#if XSYGST_ALLOW_MALLOC
+    const int n1 = SREC_SPLIT(*n);
+    lWork = n1 * (*n - n1);
+    Work = malloc(lWork * sizeof(float));
+    if (!Work)
+        lWork = 0;
+#endif
+
+    // Recursive kernel
+    RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
+
+    // Free work space
+#if XSYGST_ALLOW_MALLOC
+    if (Work)
+        free(Work);
+#endif
+}
+
+
+/** ssygst's recursive compute kernel */
+static void RELAPACK_ssygst_rec(
+    const int *itype, const char *uplo, const int *n,
+    float *A, const int *ldA, const float *B, const int *ldB,
+    float *Work, const int *lWork, int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
+        // Unblocked
+        LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info);
+        return;
+    }
+
+    // Constants
+    const float ZERO[] = { 0. };
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const float HALF[] = { .5 };
+    const float MHALF[] = { -.5 };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = SREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + n1;
+    float *const A_BR = A + *ldA * n1 + n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const float *const B_TL = B;
+    const float *const B_TR = B + *ldB * n1;
+    const float *const B_BL = B + n1;
+    const float *const B_BR = B + *ldB * n1 + n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
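    /* Background on the branches below: for itype = 1 the routine forms
       inv(L) * A * inv(L') (uplo = 'L') or inv(U') * A * inv(U), while
       itype = 2 or 3 form L' * A * L or U * A * U', where B = L * L' or
       U' * U was factored beforehand by spotrf.  In every branch the
       off-diagonal block receives a +/- 1/2 symmetric correction both
       before and after the ssyr2k, which is why each ssymm (or its
       buffered saxpy variant) appears twice. */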
+
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR' * A_BL
+            BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR'
+            BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
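When XSYGST_ALLOW_MALLOC provides a buffer, the kernel above computes the -1/2 correction term T once with ssymm and then adds it to the off-diagonal block column by column, before and after the ssyr2k, saving one ssymm per recursion level. A minimal sketch of that column-wise add, assuming the Fortran BLAS symbol saxpy_ and the block shapes of the lower-triangular branch:

/* assumed Fortran BLAS symbol (single-precision y := alpha*x + y) */
void saxpy_(const int *n, const float *alpha, const float *x,
            const int *incx, float *y, const int *incy);

/* Add the n2-by-n1 buffer T (leading dimension n2) onto the block A_BL
 * (leading dimension ldA), one column per call, mirroring the kernel's
 * reuse of the buffered ssymm result. */
static void add_buffered_block(const int n2, const int n1,
                               const float *T, float *A_BL, const int ldA) {
    const float ONE = 1.f;
    const int iONE = 1;
    int i;
    for (i = 0; i < n1; i++)
        saxpy_(&n2, &ONE, T + n2 * i, &iONE, A_BL + ldA * i, &iONE);
}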
diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c
new file mode 100644
index 000000000..8a4fad9f2
--- /dev/null
+++ b/relapack/src/ssytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *,
+    float *, const int *, int *, float *, const int *, int *);
+
+
+/** SSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's ssytrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html
+ * */
+void RELAPACK_ssytrf(
+    const char *uplo, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *lWork, int *info
+) {
+
+    // Required work size
+    const int cleanlWork = *n * (*n / 2);
+    int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+    minlWork = 1;
+#endif
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minlWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Work size query
+        *Work = cleanlWork;
+        return;
+    }
+
+    // Ensure Work size
+    float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < cleanlWork) {
+        cleanWork = malloc(cleanlWork * sizeof(float));
+        if (!cleanWork)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SSYTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Dummy arguments
+    int nout;
+
+    // Recursive kernel
+    RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (cleanWork != Work)
+        free(cleanWork);
+#endif
+}
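The lWork == -1 path above implements LAPACK's standard workspace query. A hedged usage sketch (prototype as declared above; a production caller would also check info after each call):

#include <stdlib.h>

/* prototype as introduced in this patch */
void RELAPACK_ssytrf(const char *uplo, const int *n, float *A, const int *ldA,
                     int *ipiv, float *Work, const int *lWork, int *info);

/* Two-phase call: query the required workspace size, then factorize. */
int factor(int n, float *A, int ldA, int *ipiv) {
    float query;
    int lWork = -1, info;
    RELAPACK_ssytrf("L", &n, A, &ldA, ipiv, &query, &lWork, &info);
    lWork = (int) query;
    float *Work = malloc(lWork * sizeof(float));
    if (!Work)
        return -1;
    RELAPACK_ssytrf("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
    free(Work);
    return info;
}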
+
+
+/** ssytrf's recursive compute kernel */
+static void RELAPACK_ssytrf_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *ldWork, int *info
+) {
+
+    // top recursion level?
+    const int top = *n_full == *n;
+
+    if (*n <= MAX(CROSSOVER_SSYTRF, 3)) {
+        // Unblocked
+        if (top) {
+            LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
+            *n_out = *n;
+        } else
+            RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+        return;
+    }
+
+    int info1, info2;
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    const int n_rest = *n_full - *n;
+
+    if (*uplo == 'L') {
+        // Splitting (setup)
+        int n1 = SREC_SPLIT(*n);
+        int n2 = *n - n1;
+
+        // Work_L *
+        float *const Work_L = Work;
+
+        // recursion(A_L)
+        int n1_out;
+        RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+        n1 = n1_out;
+
+        // Splitting (continued)
+        n2 = *n - n1;
+        const int n_full2 = *n_full - n1;
+
+        // *      *
+        // A_BL   A_BR
+        // A_BL_B A_BR_B
+        float *const A_BL   = A + n1;
+        float *const A_BR   = A + *ldA * n1 + n1;
+        float *const A_BL_B = A + *n;
+        float *const A_BR_B = A + *ldA * n1 + *n;
+
+        // *       *
+        // Work_BL Work_BR
+        // *       *
+        // (top recursion level: use Work as Work_BR)
+        float *const Work_BL = Work + n1;
+        float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
+        const int ldWork_BR = top ? n2 : *ldWork;
+
+        // ipiv_T
+        // ipiv_B
+        int *const ipiv_B = ipiv + n1;
+
+        // A_BR = A_BR - A_BL Work_BL'
+        RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+        BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+        // recursion(A_BR)
+        int n2_out;
+        RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+        if (n2_out != n2) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // last column of A_BR
+            float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
+
+            // last row of A_BL
+            float *const A_BL_b = A_BL + n2_out;
+
+            // last row of Work_BL
+            float *const Work_BL_b = Work_BL + n2_out;
+
+            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+            BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+        }
+        n2 = n2_out;
+
+        // shift pivots
+        for (i = 0; i < n2; i++)
+            if (ipiv_B[i] > 0)
+                ipiv_B[i] += n1;
+            else
+                ipiv_B[i] -= n1;
+
+        *info = info1 || info2;
+        *n_out = n1 + n2;
+    } else {
+        // Splitting (setup)
+        int n2 = SREC_SPLIT(*n);
+        int n1 = *n - n2;
+
+        // * Work_R
+        // (top recursion level: use Work as Work_R)
+        float *const Work_R = top ? Work : Work + *ldWork * n1;
+
+        // recursion(A_R)
+        int n2_out;
+        RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+        const int n2_diff = n2 - n2_out;
+        n2 = n2_out;
+
+        // Splitting (continued)
+        n1 = *n - n2;
+        const int n_full1 = *n_full - n2;
+
+        // * A_TL_T A_TR_T
+        // * A_TL   A_TR
+        // * *      *
+        float *const A_TL_T = A + *ldA * n_rest;
+        float *const A_TR_T = A + *ldA * (n_rest + n1);
+        float *const A_TL   = A + *ldA * n_rest + n_rest;
+        float *const A_TR   = A + *ldA * (n_rest + n1) + n_rest;
+
+        // Work_L *
+        // *      Work_TR
+        // *      *
+        // (top recursion level: Work_R was Work)
+        float *const Work_L  = Work;
+        float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
+        const int ldWork_L = top ? n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/ssytrf_rec2.c b/relapack/src/ssytrf_rec2.c
new file mode 100644
index 000000000..edc9269ec
--- /dev/null
+++ b/relapack/src/ssytrf_rec2.c
@@ -0,0 +1,351 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static float c_b8 = -1.f; +static float c_b9 = 1.f; + +/** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. + * + * This routine is a minor modification of LAPACK's slasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int * + nb, int *kb, float *a, int *lda, int *ipiv, float *w, + int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + float r__1, r__2, r__3; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k; + static float t, r1, d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *), + sgemv_(char *, int *, int *, float *, float *, int *, + float *, int *, float *, float *, int *, ftnlen); + static int kstep; + extern /* Subroutine */ int scopy_(int *, float *, int *, float *, + int *), sswap_(int *, float *, int *, float *, int * + ); + static float absakk; + extern int isamax_(int *, float *, int *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + kstep = 1; + absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); + if (imax > 1) { + i__1 = imax - 1; + jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 
1], &c__1); +/* Computing MAX */ + r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1], + dabs(r__1)); + rowmax = dmax(r__2,r__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >= + alpha * rowmax) { + kp = imax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kk - 1 - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + r1 = 1.f / a[k + k * a_dim1]; + i__1 = k - 1; + sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + d21 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d21; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + d21 = t / d21; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) + * w_dim1] - w[j + kw * w_dim1]); + a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ 
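/* Pivot-selection note: alpha = (1 + sqrt(17)) / 8, about 0.6404, is the
   classic Bunch-Kaufman growth constant.  colmax is the largest
   off-diagonal magnitude in column k of the updated panel, rowmax the
   largest magnitude in row imax.  The k-th diagonal is kept as a 1x1
   pivot when absakk >= alpha * colmax * (colmax / rowmax); failing that,
   a 1x1 pivot at imax is used when |w(imax)| >= alpha * rowmax, and only
   when both tests fail does the factorization take a 2x2 pivot
   (kstep = 2). */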
+ r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1], + dabs(r__1)); + rowmax = dmax(r__2,r__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >= + alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k + kstep - 1; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kp - kk - 1; + scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = *n - k; + sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + d21 = t / d21; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - + w[j + (k + 1) * w_dim1]); + a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c new file mode 100644 index 000000000..040df2484 --- /dev/null +++ b/relapack/src/ssytrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include +#endif + +static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); + + +/** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's ssytrf_rook. 
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html + * */ +void RELAPACK_ssytrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** ssytrf_rook's recursive compute kernel */ +static void RELAPACK_ssytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = SREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + float *const A_BL_B = A + *n; + float *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + n1; + float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = SREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + *ldA * n_rest; + float *const A_TR_T = A + *ldA * (n_rest + n1); + float *const A_TL = A + *ldA * n_rest + n_rest; + float *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/ssytrf_rook_rec2.c b/relapack/src/ssytrf_rook_rec2.c new file mode 100644 index 000000000..3308826d7 --- /dev/null +++ b/relapack/src/ssytrf_rook_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static float c_b9 = -1.f; +static float c_b10 = 1.f; + +/** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. + * + * This routine is a minor modification of LAPACK's slasyf_rook. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, float *a, int *lda, int *ipiv, float * + w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + float r__1; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k, p; + static float t, r1, d11, d12, d21, d22; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static logical done; + static int imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + static float sfmin; + static int itemp; + extern /* Subroutine */ int sgemv_(char *, int *, int *, float *, + float *, int *, float *, int *, float *, float *, int *, + ftnlen); + static int kstep; + static float stemp; + extern /* Subroutine */ int scopy_(int *, float *, int *, float *, + int *), sswap_(int *, float *, int *, float *, int * + ); + static float absakk; + extern double slamch_(char *, ftnlen); + extern int isamax_(int *, float *, int *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + sfmin = slamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) < + alpha * rowmax)) { + kp = imax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = k - 1 - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = k - 1; + sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else if (a[k + k * a_dim1] != 0.f) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L14: */ + } + } + } + } else { + if (k > 2) { + d12 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d12; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; + t = 1.f / (d11 * d22 - 1.f); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * + w_dim1] - w[j + kw * w_dim1]) / d12); + a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]) / d12); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; 
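/* Interchange replay: a positive ipiv(j) records a 1x1 pivot whose row
   and column were swapped with row ipiv(j); a negative pair
   ipiv(j) = -p, ipiv(j+1) = -q records a 2x2 pivot with two separate
   interchanges (rook pivoting may select two distinct rows, unlike the
   standard Bunch-Kaufman kernel).  The loop below applies those swaps
   to the columns outside the factorized panel so that the recursive
   caller sees a consistently permuted matrix. */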
+ sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) < + alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! 
done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = kp - k - 1; + scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = *n - k; + sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else if (a[k + k * a_dim1] != 0.f) { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L74: */ + } + } + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[ + j + (k + 1) * w_dim1]) / d21); + a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]) / d21); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + jj = j + 1; + if (jp1 != jj && kstep == 2) { + sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j >= 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/stgsyl.c b/relapack/src/stgsyl.c new file mode 100644 index 000000000..1870fb928 --- /dev/null +++ b/relapack/src/stgsyl.c @@ -0,0 +1,274 @@ +#include "relapack.h" +#include + +static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, + const int *, const float *, const int *, const float *, const int *, + float *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, float *, float *, int *, int *, + int *); + + +/** STGSYL solves the generalized Sylvester equation. + * + * This routine is functionally equivalent to LAPACK's stgsyl. 
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html
+ * */
+void RELAPACK_stgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dif,
+    float *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "T");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("STGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'T';
+
+    // Constant
+    const float ZERO[] = { 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    float scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        float dscale = 0;
+        float dsum = 1;
+        int pq;
+        RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(pq) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(slacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m);
+                LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(slacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** stgsyl's recursive compute kernel */
+static void RELAPACK_stgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dsum, float *dscale,
+    int *iWork, int *pq, int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) {
+        // Unblocked
+        LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1. };
+    float scale2[] = { 1. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
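    /* The split below halves the larger dimension.  For trans = 'N' the
       recursion solves the coupled generalized Sylvester equations
           A * R - L * B = scale * C
           D * R - L * E = scale * F
       with (A, D) and (B, E) in generalized Schur form; the subdiagonal
       tests on A and B below shift a split point by one row or column so
       that it never cuts through a 2x2 diagonal block. */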
+
+    if (*m > *n) {
+        // Splitting
+        int m1 = SREC_SPLIT(*m);
+        if (A[m1 + *ldA * (m1 - 1)])
+            m1++;
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const float *const A_TL = A;
+        const float *const A_TR = A + *ldA * m1;
+        const float *const A_BR = A + *ldA * m1 + m1;
+
+        // C_T
+        // C_B
+        float *const C_T = C;
+        float *const C_B = C + m1;
+
+        // D_TL D_TR
+        // 0    D_BR
+        const float *const D_TL = D;
+        const float *const D_TR = D + *ldD * m1;
+        const float *const D_BR = D + *ldD * m1 + m1;
+
+        // F_T
+        // F_B
+        float *const F_T = F;
+        float *const F_B = F + m1;
+
+        if (*trans == 'N') {
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // F_T = F_T - D_TR * C_B
+            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
+            }
+        } else {
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
+            // C_B = C_B - A_TR^H * C_T
+            BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // C_B = C_B - D_TR^H * F_T
+            BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldF, ONE, C_B, ldC);
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
+            }
+        }
+    } else {
+        // Splitting
+        int n1 = SREC_SPLIT(*n);
+        if (B[n1 + *ldB * (n1 - 1)])
+            n1++;
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const float *const B_TL = B;
+        const float *const B_TR = B + *ldB * n1;
+        const float *const B_BR = B + *ldB * n1 + n1;
+
+        // C_L C_R
+        float *const C_L = C;
+        float *const C_R = C + *ldC * n1;
+
+        // E_TL E_TR
+        // 0    E_BR
+        const float *const E_TL = E;
+        const float *const E_TR = E + *ldE * n1;
+        const float *const E_BR = E + *ldE * n1 + n1;
+
+        // F_L F_R
+        float *const F_L = F;
+        float *const F_R = F + *ldF * n1;
+
+        if (*trans == 'N') {
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // C_R = C_R + F_L * B_TR
+            BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
+            // F_R = F_R + F_L * E_TR
+            BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
+            }
+        } else {
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
+            // F_L = F_L + C_R * B_TR'
+            BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
+            // F_L = F_L + F_R * E_TR'
+            BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF);
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
+            }
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c
new file mode 100644
index 000000000..83947ef1a
--- /dev/null
+++ b/relapack/src/strsyl.c
@@ -0,0 +1,169 @@
+#include "relapack.h"
+
+static void RELAPACK_strsyl_rec(const char *, const char *, const int *,
+    const int *, const int *, const float *, const int *, const float *,
+    const int *, float *, const int *, float *, int *);
+
+
+/** STRSYL solves the real Sylvester matrix equation.
+ *
+ * This routine is functionally equivalent to LAPACK's strsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html
+ * */
+void RELAPACK_strsyl(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC, float *scale,
+    int *info
+) {
+
+    // Check arguments
+    const int notransA = LAPACK(lsame)(tranA, "N");
+    const int transA = LAPACK(lsame)(tranA, "T");
+    const int ctransA = LAPACK(lsame)(tranA, "C");
+    const int notransB = LAPACK(lsame)(tranB, "N");
+    const int transB = LAPACK(lsame)(tranB, "T");
+    const int ctransB = LAPACK(lsame)(tranB, "C");
+    *info = 0;
+    if (!transA && !ctransA && !notransA)
+        *info = -1;
+    else if (!transB && !ctransB && !notransB)
+        *info = -2;
+    else if (*isgn != 1 && *isgn != -1)
+        *info = -3;
+    else if (*m < 0)
+        *info = -4;
+    else if (*n < 0)
+        *info = -5;
+    else if (*ldA < MAX(1, *m))
+        *info = -7;
+    else if (*ldB < MAX(1, *n))
+        *info = -9;
+    else if (*ldC < MAX(1, *m))
+        *info = -11;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("STRSYL", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
+    const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
+
+    // Recursive kernel
+    RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
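As with spotrf above, this wrapper is call-compatible with LAPACK's strsyl: op(A)*X + isgn*X*op(B) = scale*C is solved in place, with C overwritten by X and scale set to avoid overflow. A minimal sketch for the scalar case (assuming the prototype above):

#include <stdio.h>

/* prototype as introduced in this patch */
void RELAPACK_strsyl(const char *tranA, const char *tranB, const int *isgn,
                     const int *m, const int *n,
                     const float *A, const int *ldA, const float *B, const int *ldB,
                     float *C, const int *ldC, float *scale, int *info);

int main(void) {
    /* 1x1 quasi-triangular case: 2*x + x*3 = 10, so x = 2 */
    const int m = 1, n = 1, isgn = 1, ld = 1;
    float A = 2.f, B = 3.f, C = 10.f, scale;
    int info;

    RELAPACK_strsyl("N", "N", &isgn, &m, &n, &A, &ld, &B, &ld, &C, &ld, &scale, &info);
    printf("x = %g, scale = %g, info = %d\n", C / scale, scale, info);
    return info;
}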
+
+
+/** strsyl's recursive compute kernel */
+static void RELAPACK_strsyl_rec(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC, float *scale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) {
+        // Unblocked
+        RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const float MSGN[] = { -*isgn };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1. };
+    float scale2[] = { 1. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        int m1 = SREC_SPLIT(*m);
+        if (A[m1 + *ldA * (m1 - 1)])
+            m1++;
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const float *const A_TL = A;
+        const float *const A_TR = A + *ldA * m1;
+        const float *const A_BR = A + *ldA * m1 + m1;
+
+        // C_T
+        // C_B
+        float *const C_T = C;
+        float *const C_B = C + m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        int n1 = SREC_SPLIT(*n);
+        if (B[n1 + *ldB * (n1 - 1)])
+            n1++;
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const float *const B_TL = B;
+        const float *const B_TR = B + *ldB * n1;
+        const float *const B_BR = B + *ldB * n1 + n1;
+
+        // C_L C_R
+        float *const C_L = C;
+        float *const C_R = C + *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    *scale =
scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/strsyl_rec2.c b/relapack/src/strsyl_rec2.c new file mode 100644 index 000000000..6d40a475d --- /dev/null +++ b/relapack/src/strsyl_rec2.c @@ -0,0 +1,1029 @@ +/* -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static int c_false = FALSE_; +static int c__2 = 2; +static float c_b26 = 1.f; +static float c_b30 = 0.f; +static int c_true = TRUE_; + +void RELAPACK_strsyl_rec2(char *trana, char *tranb, int *isgn, int + *m, int *n, float *a, int *lda, float *b, int *ldb, float * + c__, int *ldc, float *scale, int *info, ftnlen trana_len, + ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + float r__1, r__2; + + /* Local variables */ + static int j, k, l; + static float x[4] /* was [2][2] */; + static int k1, k2, l1, l2; + static float a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; + static int ierr; + static float smin; + extern float sdot_(int *, float *, int *, float *, int *); + static float suml, sumr; + extern int lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + static int knext, lnext; + static float xnorm; + extern /* Subroutine */ int slaln2_(int *, int *, int *, float + *, float *, float *, int *, float *, float *, float *, int *, + float *, float *, float *, int *, float *, float *, int *), + slasy2_(int *, int *, int *, int *, int *, + float *, int *, float *, int *, float *, int *, float *, + float *, int *, float *, int *), slabad_(float *, float *); + static float scaloc; + extern float slamch_(char *, ftnlen), slange_(char *, int *, + int *, float *, int *, float *, ftnlen); + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + static float bignum; + static int notrna, notrnb; + static float smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "T", (ftnlen)1, (ftnlen)1) && ! lsame_( + trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T", (ftnlen)1, (ftnlen)1) && ! 
+ lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("STRSYL", &i__1, (ftnlen)6); + return; + } + *scale = 1.f; + if (*m == 0 || *n == 0) { + return; + } + eps = slamch_("P", (ftnlen)1); + smlnum = slamch_("S", (ftnlen)1); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + smlnum = smlnum * (float) (*m * *n) / eps; + bignum = 1.f / smlnum; +/* Computing MAX */ + r__1 = smlnum, r__2 = eps * slange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * slange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = dmax(r__1,r__2); + sgn = (float) (*isgn); + if (notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L70; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.f) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L60; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.f) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L20: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = 
x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_false, &c_false, isgn, &c__2, &c__2, &a[k1 + + k1 * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, + &c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L50: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L60: + ; + } +L70: + ; + } + } else if (! 
notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L130; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.f) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = 1; + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + if (k < knext) { + goto L120; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.f) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L80: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L90: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 
* + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_true, &c_false, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L110: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L120: + ; + } +L130: + ; + } + } else if (! notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L190; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.f) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = 1; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + if (k < knext) { + goto L180; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.f) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l1; +/* Computing MIN */ + i__3 = l1 + 1; +/* Computing MIN */ + i__4 = l1 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L140: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * 
b[l1 + l1 * b_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L150: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L160: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_true, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 * + a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L170: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * 
c_dim1] = x[3]; + } +L180: + ; + } +L190: + ; + } + } else if (notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L250; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.f) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L240; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.f) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l1; +/* Computing MIN */ + i__2 = l1 + 1; +/* Computing MIN */ + i__3 = l1 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L200: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L210: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + 
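
Each /* Computing MIN */ marker here is f2c's expansion of a Fortran MIN intrinsic into the i__ temporaries that follow; the MIN merely clamps the start index so that a zero-length sdot_ call (reached when k1 == *m) is still handed an in-range address. In plain 0-based C, the suml term of the 1-by-1 case amounts to the following (a sketch only, real arithmetic):

    /* suml = A(k, k+1:m-1) . C(k+1:m-1, l): the dot product that folds
       the already-solved unknowns below block k into the right-hand side */
    static float dot_tail(int m, int k, int l, const float *a, int lda,
                          const float *c, int ldc) {
        float s = 0.f;
        int i;
        for (i = k + 1; i < m; i++)
            s += a[k + lda * i] * c[i + ldc * l];
        return s;
    }
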
i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L220: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_false, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L230: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L240: + ; + } +L250: + ; + } + } +} diff --git a/relapack/src/strtri.c b/relapack/src/strtri.c new file mode 100644 index 000000000..d35bbd49f --- /dev/null +++ b/relapack/src/strtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_strtri_rec(const char *, const char *, const int *, + float *, const int *, int *); + + +/** CTRTRI computes the inverse of a real upper or lower 
triangular matrix A. + * + * This routine is functionally equivalent to LAPACK's strtri. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html + * */ +void RELAPACK_strtri( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (!nounit && !unit) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("STRTRI", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleandiag = nounit ? 'N' : 'U'; + + // check for singularity + if (nounit) { + int i; + for (i = 0; i < *n; i++) + if (A[i + *ldA * i] == 0) { + *info = i; + return; + } + } + + // Recursive kernel + RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); +} + + +/** strtri's recursive compute kernel */ +static void RELAPACK_strtri_rec( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_STRTRI, 1)) { + // Unblocked + LAPACK(strti2)(uplo, diag, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = - A_BL * A_TL + BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); + // A_BL = A_BR \ A_BL + BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TR = - A_TL * A_TR + BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); + // A_TR = A_TR / A_BR + BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c new file mode 100644 index 000000000..3aa6bf531 --- /dev/null +++ b/relapack/src/zgbtrf.c @@ -0,0 +1,230 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, + const int *, double *, const int *, int *, double *, const int *, double *, + const int *, int *); + + +/** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's zgbtrf. 
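
A note on the band storage handled below: LAPACK stores a general band matrix so that entry A(i,j) sits in row kv + i - j of column j of Ab (0-based, with kv = kl + ku rows of headroom for the fill-in generated by pivoting). The routine's "unskew" trick, A = Ab + 2 * kv with ldA = ldAb - 1, makes the ordinary column-major reference A[i + ldA * j] land on exactly that slot. A sketch of the index map for real data (the complex code doubles all offsets for the interleaved real/imaginary parts):

    /* Fetch dense entry (i, j) from LAPACK band storage; valid when
       -ku <= i - j <= kl, with kv = kl + ku fill-in rows included. */
    static double get_band(const double *Ab, int ldAb, int kl, int ku,
                           int i, int j) {
        const int kv = kl + ku;                /* fill-in rows on top   */
        return Ab[(kv + i - j) + ldAb * j];    /* == A[i + (ldAb-1)*j]  */
    }
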
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html + * */ +void RELAPACK_zgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZGBTRF", &minfo); + return; + } + + // Constant + const double ZERO[] = { 0., 0. }; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + 2 * kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + double *const A_j = A + 2 * *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[2 * i] = A_j[2 * i + 1] = 0.; + } + + // Allocate work space + const int n1 = ZREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); + double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); + LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** zgbtrf's recursive compute kernel */ +static void RELAPACK_zgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { + // Unblocked + LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. 
}; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + 2 * kv; + + // Splitting + const int n1 = MIN(ZREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + double *const Ab_L = Ab; + double *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_L A_R + double *const A_L = A; + double *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * m1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + double *const A_Rl = A_R; + double *const A_Rr = A_R + 2 * *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + 2 * *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + 2 * m21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + 2 * *ldA * n21; + double *const A_BRbl = A_BR + 2 * m21; + double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + + // apply pivots to A_Rl + LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); + + // apply pivots to A_Rr columnwise + for (j = 0; j < n22; j++) { + double *const A_Rrj = A_Rr + 2 * *ldA * j; + for (i = j; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + const double tmpr = A_Rrj[2 * i]; + const double tmpc = A_Rrj[2 * i + 1]; + A_Rrj[2 * i] = A_Rrj[2 * ip]; + A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1]; + A_Rrj[2 * ip] = tmpr; + A_Rrj[2 * ip + 1] = tmpc; + } + } + } + + // A_TRl = A_TL \ A_TRl + BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // Worku = A_TRr + LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); + // Worku = A_TL \ Worku + BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + // A_TRr = Worku + LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); + // A_BRtl = A_BRtl - A_BLt * A_TRl + BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); + // A_BRbl = A_BRbl - Workl * A_TRl + BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); + // A_BRtr = A_BRtr - A_BLt * Worku + BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Workl * Worku + BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); + + // partially undo swaps in A_L + for (i = mn1 - 1; i >= 0; i--) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + 
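
Throughout these swap loops the LAPACK convention applies: ipiv entries are 1-based, hence the ip = ipiv_T[i] - 1 conversion, and ip < *kl distinguishes pivot rows still stored inside the kl-wide band panel from rows that were staged into the Workl buffer when they fell outside the band. Each branch then exchanges the length-i row prefixes with the appropriate partner; in plain real arithmetic, the underlying zswap does no more than this sketch:

    /* What each BLAS zswap above performs, shown for real data: exchange
       n elements of two rows stored with strides incx and incy. */
    static void swap_rows(int n, double *x, int incx, double *y, int incy) {
        int k;
        for (k = 0; k < n; k++) {
            double t = x[incx * k];
            x[incx * k] = y[incy * k];
            y[incy * k] = t;
        }
    }
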
if (ip < *kl) + BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + + // recursion(Ab_BR, ipiv_B) + RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/zgemmt.c b/relapack/src/zgemmt.c new file mode 100644 index 000000000..aa5930238 --- /dev/null +++ b/relapack/src/zgemmt.c @@ -0,0 +1,167 @@ +#include "relapack.h" + +static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + +static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + + +/** ZGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_zgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int ctransA = LAPACK(lsame)(transA, "C"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + const int ctransB = LAPACK(lsame)(transB, "C"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !ctransA && !notransA) + info = 2; + else if (!tranB && !ctransB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("ZGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); + const char cleantransB = notransB ? 'N' : (tranB ? 
'T' : 'C'); + + // Recursive kernel + RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** zgemmt's recursive compute kernel */ +static void RELAPACK_zgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { + // Unblocked + RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const double *const A_T = A; + const double *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const double *const B_L = B; + const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + double *const C_TL = C; + double *const C_TR = C + 2 * *ldC * n1; + double *const C_BL = C + 2 * n1; + double *const C_BR = C + 2 * *ldC * n1 + 2 * n1; + + // recursion(C_TL) + RELAPACK_zgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(zgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(zgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_zgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** zgemmt's unblocked compute kernel */ +static void RELAPACK_zgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; + + int i; + for (i = 0; i < *n; i++) { + // A_0 + // A_i + const double *const A_0 = A; + const double *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i); + + // * B_i * + const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i); + + // * C_0i * + // * C_ii * + double *const C_0i = C + 2 * *ldC * i; + double *const C_ii = C + 2 * *ldC * i + 2 * i; + + if (*uplo == 'L') { + const int nmi = *n - i; + if (*transA == 'N') + BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + else + BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + } else { + const int ip1 = i + 1; + if (*transA == 'N') + BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + else + BLAS(zgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + } + } +} diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c new file mode 100644 index 000000000..cf8921e1f --- /dev/null +++ b/relapack/src/zgetrf.c @@ -0,0 +1,117 @@ +#include "relapack.h" + +static void RELAPACK_zgetrf_rec(const int *, const int *, double *, + const int *, int *, int *); + + +/** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's zgetrf. 
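
The recursive pattern shared by the ReLAPACK *getrf routines replaces LAPACK's fixed block size with a halving split: factor the left block column, solve a unit lower triangular system for the top-right block, update the trailing Schur complement with one large matrix product, and recurse on it. Stripped of pivoting and complex arithmetic, the shape is as follows (an unpivoted teaching sketch only, not the routine itself):

    /* Unpivoted recursive LU sketch: column-major A, leading dimension ldA. */
    static void lu_rec(int n, double *A, int ldA) {
        int i, j, k;
        if (n <= 1)
            return;                                   /* 1x1 block: done   */
        const int n1 = n / 2, n2 = n - n1;
        double *ATL = A,      *ATR = A + ldA * n1;
        double *ABL = A + n1, *ABR = A + ldA * n1 + n1;
        lu_rec(n1, ATL, ldA);                         /* ATL = L1 * U1     */
        for (j = 0; j < n2; j++)                      /* ATR = L1 \ ATR    */
            for (k = 0; k < n1; k++)
                for (i = k + 1; i < n1; i++)
                    ATR[i + ldA * j] -= ATL[i + ldA * k] * ATR[k + ldA * j];
        for (k = 0; k < n1; k++)                      /* ABL = ABL / U1    */
            for (i = 0; i < n2; i++) {
                ABL[i + ldA * k] /= ATL[k + ldA * k];
                for (j = k + 1; j < n1; j++)
                    ABL[i + ldA * j] -= ABL[i + ldA * k] * ATL[k + ldA * j];
            }
        for (j = 0; j < n2; j++)                      /* ABR -= ABL * ATR  */
            for (k = 0; k < n1; k++)
                for (i = 0; i < n2; i++)
                    ABR[i + ldA * j] -= ABL[i + ldA * k] * ATR[k + ldA * j];
        lu_rec(n2, ABR, ldA);                         /* Schur complement  */
    }
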
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html
+ * */
+void RELAPACK_zgetrf(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const double ONE[] = { 1., 0. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const double *const A_L = A;
+        double *const A_R = A + 2 * *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** zgetrf's recursive compute kernel */
+static void RELAPACK_zgetrf_rec(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_ZGETRF, 1)) {
+        // Unblocked
+        LAPACK(zgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + 2 * *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR
+    BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR
+    BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/zhegst.c b/relapack/src/zhegst.c
new file mode 100644
index 000000000..d0ece2148
--- /dev/null
+++ b/relapack/src/zhegst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_zhegst_rec(const int *, const char *, const int *,
+    double *, const int *, const double *, const int *,
+    double *, const int *, int *);
+
+
+/** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's zhegst.
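
In exact arithmetic, zhegst overwrites A for itype == 1 with inv(L) * A * inv(L)' (uplo = 'L') or inv(U') * A * inv(U) (uplo = 'U'), and for itype == 2 or 3 with L' * A * L or U * A * U', where B already holds the Cholesky factor produced by zpotrf and ' denotes the conjugate transpose. Partitioning L = [L11 0; L21 L22] conformally with the blocks used below, the lower itype == 1 recursion works out to

    C11  = inv(L11) * A11 * inv(L11)'              (recursion on A_TL, B_TL)
    W    = A21 * inv(L11)' - 1/2 * L21 * C11       (ztrsm, first zhemm)
    A22 <- A22 - W * L21' - L21 * W'               (zher2k)
    C21  = inv(L22) * (W - 1/2 * L21 * C11)        (second zhemm, ztrsm)
    C22  = inv(L22) * A22 * inv(L22)'              (recursion on A_BR, B_BR)

Applying half of the L21 * C11 correction before the rank-2k update and the other half after it is what lets a single zher2k produce the complete two-sided transformation of the trailing block.
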
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html + * */ +void RELAPACK_zhegst( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHEGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Allocate work space + double *Work = NULL; + int lWork = 0; +#if XSYGST_ALLOW_MALLOC + const int n1 = ZREC_SPLIT(*n); + lWork = n1 * (*n - n1); + Work = malloc(lWork * 2 * sizeof(double)); + if (!Work) + lWork = 0; +#endif + + // recursive kernel + RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); + + // Free work space +#if XSYGST_ALLOW_MALLOC + if (Work) + free(Work); +#endif +} + + +/** zhegst's recursive compute kernel */ +static void RELAPACK_zhegst_rec( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + double *Work, const int *lWork, int *info +) { + + if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { + // Unblocked + LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info); + return; + } + + // Constants + const double ZERO[] = { 0., 0. }; + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const double HALF[] = { .5, 0. }; + const double MHALF[] = { -.5, 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // B_TL B_TR + // B_BL B_BR + const double *const B_TL = B; + const double *const B_TR = B + 2 * *ldB * n1; + const double *const B_BL = B + 2 * n1; + const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // recursion(A_TL, B_TL) + RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); + + if (*itype == 1) + if (*uplo == 'L') { + // A_BL = A_BL / B_TL' + BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); + if (*lWork >= n2 * n1) { + // T = -1/2 * B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); + } else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' + BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); + if (*lWork >= n2 * n1) + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); + else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BL = B_BR \ A_BL + BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL' \ A_TR + BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork >= n2 * n1) 
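
Both branches of this workspace test apply the same correction T twice, once before the zher2k and once after it. When the caller supplied at least n1 * n2 complex elements of workspace, T is formed a single time by zhemm into Work and added twice with zaxpy; otherwise the zhemm is simply executed twice with beta = 1, trading one extra Level-3 product for not needing the buffer.
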
{
+                // T = -1/2 * A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork >= n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork >= n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork >= n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR * A_BL
+            BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork >= n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork >= n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR
+            BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/zhetrf.c b/relapack/src/zhetrf.c
new file mode 100644
index 000000000..ef4e1f5d5
--- /dev/null
+++ b/relapack/src/zhetrf.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zhetrf.
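
RELAPACK_zhetrf follows LAPACK's workspace-query convention: a call with *lWork == -1 only deposits the required number of complex workspace elements in Work[0]. A hypothetical calling sequence (the wrapper name is invented, error handling is elided, and relapack.h is assumed to declare the routine):

    #include <stdlib.h>
    #include "relapack.h"

    /* Factor a Hermitian n x n matrix held as interleaved complex in A. */
    static void factor_hermitian(char uplo, int n, double *A, int ldA,
                                 int *ipiv, int *info) {
        double query;
        int lWork = -1;
        RELAPACK_zhetrf(&uplo, &n, A, &ldA, ipiv, &query, &lWork, info);
        lWork = (int)query;                          /* complex elements  */
        double *Work = malloc((size_t)lWork * 2 * sizeof(double));
        RELAPACK_zhetrf(&uplo, &n, A, &ldA, ipiv, Work, &lWork, info);
        free(Work);
    }
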
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html + * */ +void RELAPACK_zhetrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zhetrf's recursive compute kernel */ +static void RELAPACK_zhetrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zhetrf_rec2.c b/relapack/src/zhetrf_rec2.c new file mode 100644 index 000000000..867ea64e1 --- /dev/null +++ b/relapack/src/zhetrf_rec2.c @@ -0,0 +1,524 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static doublecomplex c_b1 = {1.,0.}; +static int c__1 = 1; + +/** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method + * + * This routine is a minor modification of LAPACK's zlahef. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int * + nb, int *kb, doublecomplex *a, int *lda, int *ipiv, + doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + double d__1, d__2, d__3, d__4; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + double sqrt(double), d_imag(doublecomplex *); + void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *, + doublecomplex *, doublecomplex *); + + /* Local variables */ + static int j, k; + static double t, r1; + static doublecomplex d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static double alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + static int kstep; + extern /* Subroutine */ int zgemv_(char *, int *, int *, + doublecomplex *, doublecomplex *, int *, doublecomplex *, + int *, doublecomplex *, doublecomplex *, int *, ftnlen), + zcopy_(int *, doublecomplex *, int *, doublecomplex *, + int *), zswap_(int *, doublecomplex *, int *, + doublecomplex *, int *); + static double absakk; + extern /* Subroutine */ int zdscal_(int *, double *, + doublecomplex *, int *); + static double colmax; + extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) + ; + extern int izamax_(int *, doublecomplex *, int *); + static double rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.) + 1.) / 8.; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + i__1 = k - 1; + zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k + kw * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + i__1 = k + kw * w_dim1; + i__2 = k + kw * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k + kw * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = imax + kw * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + kw * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
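
This zero test is the singular-column exit of the Bunch-Kaufman pivot search: absakk is the magnitude of the (real) diagonal entry of column k and colmax the largest off-diagonal magnitude in that column, so both being zero means the whole column vanishes; the factorization records the defect (the first such k is kept in *info) and continues with a trivial 1-by-1 pivot. The threshold alpha = (sqrt(17) + 1) / 8, roughly 0.6404, set at the top of the routine is the classical Bunch-Kaufman constant, chosen to balance the worst-case element growth of 1-by-1 and 2-by-2 pivot steps.
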
{ + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - 1; + zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (kw - 1) * w_dim1]), abs(d__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + r1 = 1. / a[i__1].r; + i__1 = k - 1; + zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + kw * w_dim1], &z__2); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.); + z__2.r = t, z__2.i = 0.; + z_div(&z__1, &z__2, &d21); + d21.r = z__1.r, d21.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + d_cnjg(&z__2, &d21); + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = + z__2.r * z__3.i + z__2.i * z__3.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
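+	/* lower-storage counterpart of the singularity check above: a zero
+	   column again only marks *info and does not stop the sweep */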
{ + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (imax < *n) { + i__1 = *n - imax; + zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (k + 1) * w_dim1]), abs(d__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + r1 = 1. / a[i__1].r; + i__1 = *n - k; + zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + k * w_dim1], &z__2); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.);
+		    z__2.r = t, z__2.i = 0.;
+		    z_div(&z__1, &z__2, &d21);
+		    d21.r = z__1.r, d21.i = z__1.i;
+		    i__1 = *n;
+		    for (j = k + 2; j <= i__1; ++j) {
+			i__2 = j + k * a_dim1;
+			d_cnjg(&z__2, &d21);
+			i__3 = j + k * w_dim1;
+			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+				.r;
+			i__4 = j + (k + 1) * w_dim1;
+			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+				.i;
+			z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
+				z__2.r * z__3.i + z__2.i * z__3.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+			i__2 = j + (k + 1) * a_dim1;
+			i__3 = j + (k + 1) * w_dim1;
+			z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+				z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+				.r;
+			i__4 = j + k * w_dim1;
+			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+				.i;
+			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+				d21.r * z__2.i + d21.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+		    }
+		}
+		i__1 = k + k * a_dim1;
+		i__2 = k + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + k * a_dim1;
+		i__2 = k + 1 + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + (k + 1) * a_dim1;
+		i__2 = k + 1 + (k + 1) * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = *n - k;
+		zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+		i__1 = *n - k - 1;
+		zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+	j = k - 1;
+L120:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    --j;
+	}
+	--j;
+	if (jp != jj && j >= 1) {
+	    zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j > 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c
new file mode 100644
index 000000000..15ceaeae7
--- /dev/null
+++ b/relapack/src/zhetrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zhetrf_rook.
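+ *
+ * A minimal calling sketch (not from the original sources; it merely
+ * restates the argument handling coded below, where lWork == -1 is a
+ * workspace query that stores the required size, n*(n/2) complex
+ * elements, in Work[0]):
+ *
+ *   int n = 100, ldA = 100, ipiv[100], info, lWork = -1;
+ *   double *A = ...;           // n x n Hermitian, interleaved re/im parts
+ *   double query[2];
+ *   RELAPACK_zhetrf_rook("L", &n, A, &ldA, ipiv, query, &lWork, &info);
+ *   lWork = (int) query[0];    // = n * (n / 2)
+ *   double *Work = malloc(2 * lWork * sizeof(double));
+ *   RELAPACK_zhetrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *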
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html + * */ +void RELAPACK_zhetrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zhetrf_rook's recursive compute kernel */ +static void RELAPACK_zhetrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
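+	    /* top recursion level: the whole buffer doubles as Work_BR,
+	       so its leading dimension is just n2, not the caller's ldWork */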
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zhetrf_rook_rec2.c b/relapack/src/zhetrf_rook_rec2.c new file mode 100644 index 000000000..a56ad710b --- /dev/null +++ b/relapack/src/zhetrf_rook_rec2.c @@ -0,0 +1,662 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
+ *
+ * This routine is a minor modification of LAPACK's zlahef_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n,
+	int *nb, int *kb, doublecomplex *a, int *lda, int *
+	ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2;
+    doublecomplex z__1, z__2, z__3, z__4, z__5;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
+	    doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static double t, r1;
+    static doublecomplex d11, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    static double dtemp, sfmin;
+    static int itemp, kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
+	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+	    int *), zswap_(int *, doublecomplex *, int *,
+	    doublecomplex *, int *);
+    extern double dlamch_(char *, ftnlen);
+    static double absakk;
+    extern /* Subroutine */ int zdscal_(int *, double *,
+	    doublecomplex *, int *);
+    static double colmax;
+    extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
+	    ;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.) + 1.)
/ 8.; + sfmin = dlamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + if (k > 1) { + i__1 = k - 1; + zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & + c__1); + } + i__1 = k + kw * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + i__1 = k + kw * w_dim1; + i__2 = k + kw * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k + kw * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = imax + kw * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + kw * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + d__1 = w[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + if (k > 1) { + i__1 = k - 1; + zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], + &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + if (imax > 1) { + i__1 = imax - 1; + zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + } + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (kw - 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (kw - 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! 
done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = k - 1 - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + i__1 = k - 1 - p; + zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); + if (p > 1) { + i__1 = p - 1; + zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (abs(t) >= sfmin) { + r1 = 1. / t; + i__1 = k - 1; + zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L14: */ + } + } + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + kw * w_dim1], &z__2); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d21); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + d_cnjg(&z__5, &d21); + z_div(&z__2, &z__3, &z__5); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + ++jj; + if (kstep == 2 && jp1 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * + a_dim1], &c__1); + } + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (imax < *n) { + i__1 = *n - imax; + zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (k + 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (k + 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = p - k - 1; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * + a_dim1], lda); + i__1 = p - k - 1; + zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); + if (p < *n) { + i__1 = *n - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p + * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + } + zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (abs(t) >= sfmin) { + r1 = 1. 
/ t; + i__1 = *n - k; + zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L74: */ + } + } + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + k * w_dim1], &z__2); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. / (z__1.r - 1.); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + d_cnjg(&z__5, &d21); + z_div(&z__2, &z__3, &z__5); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d21); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = *n - k - 1; + zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + --jj; + if (kstep == 2 && jp1 != jj && j >= 1) { + zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/zlauum.c b/relapack/src/zlauum.c new file mode 100644 index 000000000..490dcc82e --- /dev/null +++ b/relapack/src/zlauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_zlauum_rec(const char *, const int *, double *, + const int *, int *); + + +/** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's zlauum. 
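+ *
+ * The recursive kernel below uses the 2x2 block form of the product:
+ * with L = [L_TL 0; L_BL L_BR], the top-left block of L**H * L is
+ * L_TL**H * L_TL + L_BL**H * L_BL (recursion plus a ZHERK update) and
+ * the bottom-left block is L_BR**H * L_BL (a ZTRMM); the U case mirrors
+ * this with A_TR and transposed side flags.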
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html + * */ +void RELAPACK_zlauum( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_zlauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** zlauum's recursive compute kernel */ +static void RELAPACK_zlauum_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { + // Unblocked + LAPACK(zlauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c new file mode 100644 index 000000000..37e711c9d --- /dev/null +++ b/relapack/src/zpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, + double *, const int *, double *, const int *, int *); + + +/** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's zpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html + * */ +void RELAPACK_zpbtrf( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const double ZERO[] = { 0., 0. }; + + // Allocate work space + const int n1 = ZREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; + double *Work = malloc(mWork * nWork * 2 * sizeof(double)); + LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** zpbtrf's recursive compute kernel */ +static void RELAPACK_zpbtrf_rec( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + double *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { + // Unblocked + LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); + + // Splitting + const int n1 = MIN(ZREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + double *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + 2 * *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + 2 * n21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + 2 * *ldA * n21; + double *const A_BRbl = A_BR + 2 * n21; + double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/zpotrf.c b/relapack/src/zpotrf.c new file mode 100644 index 000000000..411ac5fc0 --- /dev/null +++ 
b/relapack/src/zpotrf.c
@@ -0,0 +1,92 @@
+#include "relapack.h"
+
+static void RELAPACK_zpotrf_rec(const char *, const int *, double *,
+    const int *, int *);
+
+
+/** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's zpotrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html
+ * */
+void RELAPACK_zpotrf(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZPOTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Recursive kernel
+    RELAPACK_zpotrf_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** zpotrf's recursive compute kernel */
+static void RELAPACK_zpotrf_rec(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) {
+        // Unblocked
+        LAPACK(zpotf2)(uplo, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // recursion(A_TL)
+    RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = A_BL / A_TL'
+        BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
+        // A_BR = A_BR - A_BL * A_BL'
+        BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
+    } else {
+        // A_TR = A_TL' \ A_TR
+        BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+        // A_BR = A_BR - A_TR' * A_TR
+        BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c
new file mode 100644
index 000000000..3be21563a
--- /dev/null
+++ b/relapack/src/zsytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zsytrf.
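+ *
+ * When compiled with XSYTRF_ALLOW_MALLOC, an undersized lWork (other
+ * than the -1 size query) is tolerated: the code below falls back to a
+ * heap buffer of n*(n/2) complex elements and frees it before returning;
+ * without that flag the same condition is reported as argument error -7
+ * via XERBLA.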
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html + * */ +void RELAPACK_zsytrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zsytrf's recursive compute kernel */ +static void RELAPACK_zsytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zsytrf_rec2.c b/relapack/src/zsytrf_rec2.c new file mode 100644 index 000000000..33902ee9e --- /dev/null +++ b/relapack/src/zsytrf_rec2.c @@ -0,0 +1,452 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's zlasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int *
+	nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
+	doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2, d__3, d__4;
+    doublecomplex z__1, z__2, z__3;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k;
+    static doublecomplex t, r1, d11, d21, d22;
+    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
+	    doublecomplex *, int *);
+    static int kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
+	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+	    int *), zswap_(int *, doublecomplex *, int *,
+	    doublecomplex *, int *);
+    static double absakk, colmax;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.) + 1.) / 8.;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    z__1.r = -1., z__1.i = -0.;
+	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+		    lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	kstep = 1;
+	i__1 = k + kw * w_dim1;
+	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
+		w_dim1]), abs(d__2));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+		    kw * w_dim1]), abs(d__2));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.)
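+	/* zero column: as in the Hermitian kernels, only *info is set. Since
+	   this kernel is complex symmetric rather than Hermitian, diagonal
+	   entries stay complex; 1x1 pivots below therefore scale with the
+	   complex reciprocal (z_div + zscal_) instead of the real zdscal_. */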
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (kw - 1) * w_dim1]), abs(d__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha * + rowmax) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = k - 1; + zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + kw * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * + d22.i + d11.i * d22.r; + z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; + z_div(&z__1, &c_b1, &z__2); + t.r = z__1.r, t.i = z__1.i; + z_div(&z__1, &t, &d21); + d21.r = z__1.r, d21.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * 
z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * + w_dim1]), abs(d__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (k + 1) * w_dim1]), abs(d__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha * + rowmax) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + 
a_dim1], lda, &a[kp + a_dim1], lda);
+	    }
+	    zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+	}
+	if (kstep == 1) {
+	    i__1 = *n - k + 1;
+	    zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+		    c__1);
+	    if (k < *n) {
+		z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
+		r1.r = z__1.r, r1.i = z__1.i;
+		i__1 = *n - k;
+		zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+	    }
+	} else {
+	    if (k < *n - 1) {
+		i__1 = k + 1 + k * w_dim1;
+		d21.r = w[i__1].r, d21.i = w[i__1].i;
+		z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		d11.r = z__1.r, d11.i = z__1.i;
+		z_div(&z__1, &w[k + k * w_dim1], &d21);
+		d22.r = z__1.r, d22.i = z__1.i;
+		z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
+			d22.i + d11.i * d22.r;
+		z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
+		z_div(&z__1, &c_b1, &z__2);
+		t.r = z__1.r, t.i = z__1.i;
+		z_div(&z__1, &t, &d21);
+		d21.r = z__1.r, d21.i = z__1.i;
+		i__1 = *n;
+		for (j = k + 2; j <= i__1; ++j) {
+		    i__2 = j + k * a_dim1;
+		    i__3 = j + k * w_dim1;
+		    z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+			    z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
+			    .r;
+		    i__4 = j + (k + 1) * w_dim1;
+		    z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+			    .i;
+		    z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+			    d21.r * z__2.i + d21.i * z__2.r;
+		    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+		    i__2 = j + (k + 1) * a_dim1;
+		    i__3 = j + (k + 1) * w_dim1;
+		    z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+			    z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+			    .r;
+		    i__4 = j + k * w_dim1;
+		    z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+			    .i;
+		    z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+			    d21.r * z__2.i + d21.i * z__2.r;
+		    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+		}
+	    }
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + k * a_dim1;
+	    i__2 = k + 1 + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + (k + 1) * a_dim1;
+	    i__2 = k + 1 + (k + 1) * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	}
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+	j = k - 1;
+L120:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    --j;
+	}
+	--j;
+	if (jp != jj && j >= 1) {
+	    zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j > 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c
new file mode 100644
index 000000000..c598f7b1e
--- /dev/null
+++ b/relapack/src/zsytrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zsytrf_rook.
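+ *
+ * As in LAPACK, passing lWork = -1 performs a workspace-size query that
+ * stores the required size (in complex elements) in Work[0]. The call
+ * sequence below is only an illustrative sketch added for this write-up
+ * (the n = 100 dimension and the buffer names are assumptions; A is
+ * assumed to hold a complex symmetric matrix in LAPACK column-major
+ * layout):
+ *
+ *   int n = 100, ldA = n, ipiv[100], info, query = -1;
+ *   double qwork[2];
+ *   RELAPACK_zsytrf_rook("L", &n, A, &ldA, ipiv, qwork, &query, &info);
+ *   int lWork = (int) qwork[0];                      // complex elements
+ *   double *Work = malloc(2 * lWork * sizeof(double));
+ *   RELAPACK_zsytrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *   free(Work);
+ *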
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html + * */ +void RELAPACK_zsytrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zsytrf_rook's recursive compute kernel */ +static void RELAPACK_zsytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zsytrf_rook_rec2.c b/relapack/src/zsytrf_rook_rec2.c new file mode 100644 index 000000000..9e111fe0c --- /dev/null +++ b/relapack/src/zsytrf_rook_rec2.c @@ -0,0 +1,561 @@ +/* -- translated by f2c (version 20100827). 
You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's zlasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n,
+	int *nb, int *kb, doublecomplex *a, int *lda, int *
+	ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static doublecomplex t, r1, d11, d12, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    static double dtemp, sfmin;
+    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
+	    doublecomplex *, int *);
+    static int itemp, kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
+	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+	    int *), zswap_(int *, doublecomplex *, int *,
+	    doublecomplex *, int *);
+    extern double dlamch_(char *, ftnlen);
+    static double absakk, colmax;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.) + 1.) / 8.;
+    sfmin = dlamch_("S", (ftnlen)1);
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	kstep = 1;
+	p = k;
+	zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    z__1.r = -1., z__1.i = -0.;
+	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	i__1 = k + kw * w_dim1;
+	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
+		w_dim1]), abs(d__2));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+		    kw * w_dim1]), abs(d__2));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	    zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	} else {
+	    if (!
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (kw - 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (kw - 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = k - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + + k * a_dim1]), abs(d__2)) >= sfmin) { + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = k - 1; + zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0. || a[i__1].i != 0.) 
{ + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + z_div(&z__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L14: */ + } + } + } + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d12.r = w[i__1].r, d12.i = w[i__1].i; + z_div(&z__1, &w[k + kw * w_dim1], &d12); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); + d22.r = z__1.r, d22.i = z__1.i; + z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * + d22.i + d11.i * d22.r; + z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; + z_div(&z__1, &c_b1, &z__2); + t.r = z__1.r, t.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d12); + z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * + z__2.i + t.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d12); + z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * + z__2.i + t.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * + w_dim1]), abs(d__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (k + 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (k + 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - k - 1; + zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + + k * a_dim1]), abs(d__2)) >= sfmin) { + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = *n - k; + zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0. || a[i__1].i != 0.) 
{
+			i__1 = *n;
+			for (ii = k + 1; ii <= i__1; ++ii) {
+			    i__2 = ii + k * a_dim1;
+			    z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
+				    a_dim1]);
+			    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L74: */
+			}
+		    }
+		}
+	    }
+	} else {
+	    if (k < *n - 1) {
+		i__1 = k + 1 + k * w_dim1;
+		d21.r = w[i__1].r, d21.i = w[i__1].i;
+		z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		d11.r = z__1.r, d11.i = z__1.i;
+		z_div(&z__1, &w[k + k * w_dim1], &d21);
+		d22.r = z__1.r, d22.i = z__1.i;
+		z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
+			d22.i + d11.i * d22.r;
+		z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
+		z_div(&z__1, &c_b1, &z__2);
+		t.r = z__1.r, t.i = z__1.i;
+		i__1 = *n;
+		for (j = k + 2; j <= i__1; ++j) {
+		    i__2 = j + k * a_dim1;
+		    i__3 = j + k * w_dim1;
+		    z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+			    z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+			    .r;
+		    i__4 = j + (k + 1) * w_dim1;
+		    z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+			    .i;
+		    z_div(&z__2, &z__3, &d21);
+		    z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
+			    z__2.i + t.i * z__2.r;
+		    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+		    i__2 = j + (k + 1) * a_dim1;
+		    i__3 = j + (k + 1) * w_dim1;
+		    z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+			    z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+			    .r;
+		    i__4 = j + k * w_dim1;
+		    z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+			    .i;
+		    z_div(&z__2, &z__3, &d21);
+		    z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
+			    z__2.i + t.i * z__2.r;
+		    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+		}
+	    }
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + k * a_dim1;
+	    i__2 = k + 1 + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + (k + 1) * a_dim1;
+	    i__2 = k + 1 + (k + 1) * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	}
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -p;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+	j = k - 1;
+L120:
+	kstep = 1;
+	jp1 = 1;
+	jj = j;
+	jp2 = ipiv[j];
+	if (jp2 < 0) {
+	    jp2 = -jp2;
+	    --j;
+	    jp1 = -ipiv[j];
+	    kstep = 2;
+	}
+	--j;
+	if (jp2 != jj && j >= 1) {
+	    zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	jj = j + 1;
+	if (jp1 != jj && kstep == 2) {
+	    zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j >= 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/ztgsyl.c b/relapack/src/ztgsyl.c
new file mode 100644
index 000000000..2c8a35256
--- /dev/null
+++ b/relapack/src/ztgsyl.c
@@ -0,0 +1,268 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *,
+    const int *, const double *, const int *, const double *, const int *,
+    double *, const int *, const double *, const int *, const double *,
+    const int *, double *, const int *, double *, double *, double *, int *);
+
+
+/** ZTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's ztgsyl.
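+ *
+ * As documented for LAPACK's ztgsyl, with trans = 'N' it solves the
+ * generalized Sylvester equation
+ *
+ *   A * R - L * B = scale * C
+ *   D * R - L * E = scale * F,
+ *
+ * where R and L are unknown m-by-n matrices that overwrite C and F, and
+ * the pairs (A, D) and (B, E) are given in generalized Schur form.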
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html
+ * */
+void RELAPACK_ztgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dif,
+    double *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "C");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZTGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'C';
+
+    // Constant
+    const double ZERO[] = { 0., 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    double scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        double dscale = 0;
+        double dsum = 1;
+        RELAPACK_ztgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(zlacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
+                LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(zlacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** ztgsyl's recursive compute kernel */
+static void RELAPACK_ztgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dsum, double *dscale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) {
+        // Unblocked
+        LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1., 0. };
+    double scale2[] = { 1., 0.
}; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + const int m1 = ZREC_SPLIT(*m); + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const double *const A_TL = A; + const double *const A_TR = A + 2 * *ldA * m1; + const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1; + + // C_T + // C_B + double *const C_T = C; + double *const C_B = C + 2 * m1; + + // D_TL D_TR + // 0 D_BR + const double *const D_TL = D; + const double *const D_TR = D + 2 * *ldD * m1; + const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1; + + // F_T + // F_B + double *const F_T = F; + double *const F_B = F + 2 * m1; + + if (*trans == 'N') { + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); + // C_T = C_T - A_TR * C_B + BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // F_T = F_T - D_TR * C_B + BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); + } + } else { + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); + // C_B = C_B - A_TR^H * C_T + BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // C_B = C_B - D_TR^H * F_T + BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); + } + } + } else { + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const double *const B_TL = B; + const double *const B_TR = B + 2 * *ldB * n1; + const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // C_L C_R + double *const C_L = C; + double *const C_R = C + 2 * *ldC * n1; + + // E_TL E_TR + // 0 E_BR + const double *const E_TL = E; + const double *const E_TR = E + 2 * *ldE * n1; + const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1; + + // F_L F_R + double *const F_L = F; + double *const F_R = F + 2 * *ldF * n1; + + if (*trans == 'N') { + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); + // C_R = C_R + F_L * B_TR + BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); + // F_R = F_R + F_L * E_TR + BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, 
ONE, scale2, m, &n1, C_L, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); + } + } else { + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); + // F_L = F_L + C_R * B_TR + BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); + // F_L = F_L + F_R * E_TR + BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); + } + } + } + + *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c new file mode 100644 index 000000000..82b2c8803 --- /dev/null +++ b/relapack/src/ztrsyl.c @@ -0,0 +1,163 @@ +#include "relapack.h" + +static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, int *); + + +/** ZTRSYL solves the complex Sylvester matrix equation. + * + * This routine is functionally equivalent to LAPACK's ztrsyl. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html + * */ +void RELAPACK_ztrsyl( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + + // Check arguments + const int notransA = LAPACK(lsame)(tranA, "N"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int ctransB = LAPACK(lsame)(tranB, "C"); + *info = 0; + if (!ctransA && !notransA) + *info = -1; + else if (!ctransB && !notransB) + *info = -2; + else if (*isgn != 1 && *isgn != -1) + *info = -3; + else if (*m < 0) + *info = -4; + else if (*n < 0) + *info = -5; + else if (*ldA < MAX(1, *m)) + *info = -7; + else if (*ldB < MAX(1, *n)) + *info = -9; + else if (*ldC < MAX(1, *m)) + *info = -11; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZTRSYL", &minfo); + return; + } + + // Clean char * arguments + const char cleantranA = notransA ? 'N' : 'C'; + const char cleantranB = notransB ? 'N' : 'C'; + + // Recursive kernel + RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} + + +/** ztrsyl's recursive compute kernel */ +static void RELAPACK_ztrsyl_rec( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + + if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { + // Unblocked + RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const double MSGN[] = { -*isgn, 0. 
};
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1., 0. };
+    double scale2[] = { 1., 0. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        const int m1 = ZREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + 2 * *ldA * m1;
+        const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + 2 * m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        const int n1 = ZREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + 2 * *ldB * n1;
+        const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + 2 * *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c
new file mode 100644
index 000000000..526ab097c
--- /dev/null
+++ b/relapack/src/ztrsyl_rec2.c
@@ -0,0 +1,394 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "../config.h" +#include "f2c.h" + +#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { + extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); + doublecomplex result; + zdotu_(&result, n, x, incx, y, incy); + return result; +} +#define zdotu_ zdotu_fun + +doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { + extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); + doublecomplex result; + zdotc_(&result, n, x, incx, y, incy); + return result; +} +#define zdotc_ zdotc_fun +#endif + +#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { + extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *); + doublecomplex result; + zladiv_(&result, a, b); + return result; +} +#define zladiv_ zladiv_fun +#endif + +/* Table of constant values */ + +static int c__1 = 1; + +/** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) + * + * This routine is an exact copy of LAPACK's ztrsyl. + * It serves as an unblocked kernel in the recursive algorithms. + * */ +/* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int + *isgn, int *m, int *n, doublecomplex *a, int *lda, + doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, + double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + double d__1, d__2; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + double d_imag(doublecomplex *); + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + static int j, k, l; + static doublecomplex a11; + static double db; + static doublecomplex x11; + static double da11; + static doublecomplex vec; + static double dum[1], eps, sgn, smin; + static doublecomplex suml, sumr; + extern int lsame_(char *, char *, ftnlen, ftnlen); + /* Double Complex */ doublecomplex zdotc_(int *, + doublecomplex *, int *, doublecomplex *, int *), zdotu_( + int *, doublecomplex *, int *, + doublecomplex *, int *); + extern /* Subroutine */ int dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen); + static double scaloc; + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + extern double zlange_(char *, int *, int *, doublecomplex *, + int *, double *, ftnlen); + static double bignum; + extern /* Subroutine */ int zdscal_(int *, double *, + doublecomplex *, int *); + /* Double Complex */ doublecomplex zladiv_(doublecomplex *, + doublecomplex *); + static int notrna, notrnb; + static double smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, 
(ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZTRSY2", &i__1, (ftnlen)6); + return; + } + *scale = 1.; + if (*m == 0 || *n == 0) { + return; + } + eps = dlamch_("P", (ftnlen)1); + smlnum = dlamch_("S", (ftnlen)1); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + smlnum = smlnum * (double) (*m * *n) / eps; + bignum = 1. / smlnum; +/* Computing MAX */ + d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = max(d__1,d__2); + sgn = (double) (*isgn); + if (notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + for (k = *m; k >= 1; --k) { + i__2 = *m - k; +/* Computing MIN */ + i__3 = k + 1; +/* Computing MIN */ + i__4 = k + 1; + z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ + min(i__4,*m) + l * c_dim1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__2 = l - 1; + z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = z__1.r, sumr.i = z__1.i; + i__2 = k + l * c_dim1; + z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i; + z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i; + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L20: */ + } +/* L30: */ + } + } else if (! 
notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + i__3 = k - 1; + z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__3 = l - 1; + z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = z__1.r, sumr.i = z__1.i; + i__3 = k + l * c_dim1; + z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + d_cnjg(&z__2, &a[k + k * a_dim1]); + i__3 = l + l * b_dim1; + z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; + z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + i__3 = k + l * c_dim1; + c__[i__3].r = x11.r, c__[i__3].i = x11.i; +/* L50: */ + } +/* L60: */ + } + } else if (! notrna && ! notrnb) { + for (l = *n; l >= 1; --l) { + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = k - 1; + z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__2 = *n - l; +/* Computing MIN */ + i__3 = l + 1; +/* Computing MIN */ + i__4 = l + 1; + z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ + l + min(i__4,*n) * b_dim1], ldb); + sumr.r = z__1.r, sumr.i = z__1.i; + i__2 = k + l * c_dim1; + d_cnjg(&z__4, &sumr); + z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; + z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i; + d_cnjg(&z__1, &z__2); + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L80: */ + } +/* L90: */ + } + } else if (notrna && ! 
notrnb) {
+	for (l = *n; l >= 1; --l) {
+	    for (k = *m; k >= 1; --k) {
+		i__1 = *m - k;
+/* Computing MIN */
+		i__2 = k + 1;
+/* Computing MIN */
+		i__3 = k + 1;
+		z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
+			min(i__3,*m) + l * c_dim1], &c__1);
+		suml.r = z__1.r, suml.i = z__1.i;
+		i__1 = *n - l;
+/* Computing MIN */
+		i__2 = l + 1;
+/* Computing MIN */
+		i__3 = l + 1;
+		z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
+			l + min(i__3,*n) * b_dim1], ldb);
+		sumr.r = z__1.r, sumr.i = z__1.i;
+		i__1 = k + l * c_dim1;
+		d_cnjg(&z__4, &sumr);
+		z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
+		z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
+		z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i;
+		vec.r = z__1.r, vec.i = z__1.i;
+		scaloc = 1.;
+		i__1 = k + k * a_dim1;
+		d_cnjg(&z__3, &b[l + l * b_dim1]);
+		z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i;
+		z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i;
+		a11.r = z__1.r, a11.i = z__1.i;
+		da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
+			d__2));
+		if (da11 <= smin) {
+		    a11.r = smin, a11.i = 0.;
+		    da11 = smin;
+		    *info = 1;
+		}
+		db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
+			d__2));
+		if (da11 < 1. && db > 1.) {
+		    if (db > bignum * da11) {
+			scaloc = 1. / db;
+		    }
+		}
+		z__3.r = scaloc, z__3.i = 0.;
+		z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
+			z__3.i + vec.i * z__3.r;
+		z__1 = zladiv_(&z__2, &a11);
+		x11.r = z__1.r, x11.i = z__1.i;
+		if (scaloc != 1.) {
+		    i__1 = *n;
+		    for (j = 1; j <= i__1; ++j) {
+			zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L100: */
+		    }
+		    *scale *= scaloc;
+		}
+		i__1 = k + l * c_dim1;
+		c__[i__1].r = x11.r, c__[i__1].i = x11.i;
+/* L110: */
+	    }
+/* L120: */
+	}
+    }
+    return;
+}
diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c
new file mode 100644
index 000000000..ac9fe7bd4
--- /dev/null
+++ b/relapack/src/ztrtri.c
@@ -0,0 +1,107 @@
+#include "relapack.h"
+
+static void RELAPACK_ztrtri_rec(const char *, const char *, const int *,
+    double *, const int *, int *);
+
+
+/** ZTRTRI computes the inverse of a complex upper or lower triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's ztrtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html
+ * */
+void RELAPACK_ztrtri(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZTRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ?
'N' : 'U'; + + // check for singularity + if (nounit) { + int i; + for (i = 0; i < *n; i++) + if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { + *info = i; + return; + } + } + + // Recursive kernel + RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); +} + + +/** ztrtri's recursive compute kernel */ +static void RELAPACK_ztrtri_rec( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) { + // Unblocked + LAPACK(ztrti2)(uplo, diag, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = - A_BL * A_TL + BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); + // A_BL = A_BR \ A_BL + BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TR = - A_TL * A_TR + BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); + // A_TR = A_TR / A_BR + BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} From b122413fb01aacff3112fdb387e74126539a8e5f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Jun 2017 18:13:14 +0200 Subject: [PATCH 12/42] Restore ReLAPACK test folder --- relapack/test/README.md | 48 ++++++++++++++ relapack/test/config.h | 13 ++++ relapack/test/lapack.h | 64 +++++++++++++++++++ relapack/test/test.h | 136 ++++++++++++++++++++++++++++++++++++++++ relapack/test/util.c | 116 ++++++++++++++++++++++++++++++++++ relapack/test/util.h | 15 +++++ relapack/test/xgbtrf.c | 43 +++++++++++++ relapack/test/xgemmt.c | 65 +++++++++++++++++++ relapack/test/xgetrf.c | 32 ++++++++++ relapack/test/xhegst.c | 32 ++++++++++ relapack/test/xhetrf.c | 40 ++++++++++++ relapack/test/xlauum.c | 25 ++++++++ relapack/test/xpbtrf.c | 40 ++++++++++++ relapack/test/xpotrf.c | 25 ++++++++ relapack/test/xsygst.c | 32 ++++++++++ relapack/test/xsytrf.c | 40 ++++++++++++ relapack/test/xtgsyl.c | 94 +++++++++++++++++++++++++++ relapack/test/xtrsyl.c | 65 +++++++++++++++++++ relapack/test/xtrtri.c | 25 ++++++++ 19 files changed, 950 insertions(+) create mode 100644 relapack/test/README.md create mode 100644 relapack/test/config.h create mode 100644 relapack/test/lapack.h create mode 100644 relapack/test/test.h create mode 100644 relapack/test/util.c create mode 100644 relapack/test/util.h create mode 100644 relapack/test/xgbtrf.c create mode 100644 relapack/test/xgemmt.c create mode 100644 relapack/test/xgetrf.c create mode 100644 relapack/test/xhegst.c create mode 100644 relapack/test/xhetrf.c create mode 100644 relapack/test/xlauum.c create mode 100644 relapack/test/xpbtrf.c create mode 100644 relapack/test/xpotrf.c create mode 100644 relapack/test/xsygst.c create mode 100644 relapack/test/xsytrf.c create mode 100644 relapack/test/xtgsyl.c create mode 100644 relapack/test/xtrsyl.c create mode 100644 relapack/test/xtrtri.c diff --git a/relapack/test/README.md b/relapack/test/README.md new file mode 100644 index 000000000..48434b3cf --- /dev/null +++ 
b/relapack/test/README.md
@@ -0,0 +1,48 @@
+ReLAPACK Test Suite
+===================
+This test suite compares ReLAPACK's recursive routines with LAPACK's compute
+routines in terms of accuracy: For each test-case, we execute both ReLAPACK's
+and LAPACK's routine on the same data and consider the numerical difference
+between the two solutions.
+
+This difference is computed as the maximum error across all elements of the
+routine's outputs, where the error for each element is the minimum of the
+absolute error and the relative error (with LAPACK as the reference). If the
+error is below the error bound configured in `config.h` (default: 1e-4 for
+single precision and 1e-13 for double precision), the test-case is considered
+passed.
+
+For each routine the test-cases cover a variety of input argument combinations
+to ensure that ReLAPACK's routines match the functionality of LAPACK for all
+use cases.
+
+The matrix size for all experiments (default: 100) can also be specified in
+`config.h`.
+
+
+Implementation
+--------------
+`test.h` provides the framework for our tests: it provides macros that make it
+possible to generalize the tests for each operation into one file covering all
+data-types. Such a file is structured as follows:
+
+ * All matrices required by the test-cases are declared globally. For each
+   matrix, an array of two pointers is declared; one for the matrix copy passed
+   to ReLAPACK and one for the copy passed to LAPACK.
+
+ * `tests()` contains the main control flow: it allocates (and later frees) the
+   copies of the globally declared matrices. It then defines the macro
+   `ROUTINE` to contain the name of the currently tested routine and uses the
+   macro `TEST` to perform the test-cases. `TEST` receives the arguments of the
+   routine; matrices of which ReLAPACK and LAPACK each receive a copy are
+   indexed with `i`. (Example: `TEST("L", &n, A[i], &n, info);`)
+
+ * The macro `TEST` first calls `pre()`, which initializes all relevant
+   matrices, then executes the ReLAPACK algorithm on the matrices with `i` =
+   `0` and then the LAPACK counterpart with `i` = `1`. It then calls `post()`,
+   which computes the difference between the results, storing it in `error`.
+   Finally, the error is printed out and compared to the error bound.
+
+If all test-cases pass the error bound test, the program will have a `0` return
+value; otherwise it is `1`, indicating an error.
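+
+As an illustrative sketch (not one of the shipped test files -- the `n`,
+`info`, and allocation details shown here are assumptions; the real macro
+definitions live in `test.h`), a test for a Cholesky-style routine following
+the conventions above would look roughly like this:
+
+    #include "test.h"
+
+    // A[0]: copy passed to ReLAPACK, A[1]: copy passed to LAPACK
+    double *A[2];
+
+    void tests() {
+        A[0] = malloc(n * n * sizeof(double));
+        A[1] = malloc(n * n * sizeof(double));
+
+        #define ROUTINE dpotrf
+
+        TEST("L", &n, A[i], &n, info);
+        TEST("U", &n, A[i], &n, info);
+
+        free(A[0]);
+        free(A[1]);
+    }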
diff --git a/relapack/test/config.h b/relapack/test/config.h new file mode 100644 index 000000000..ab06a2fff --- /dev/null +++ b/relapack/test/config.h @@ -0,0 +1,13 @@ +#ifndef TEST_CONFIG_H +#define TEST_CONFIG_H + +// error bound for single and single complex routines +#define SINGLE_ERR_BOUND 1e-4 + +// error bound for double and double complex routines +#define DOUBLE_ERR_BOUND 1e-13 + +// size of test matrices +#define TEST_SIZE 100 + +#endif /* TEST_CONFIG_H */ diff --git a/relapack/test/lapack.h b/relapack/test/lapack.h new file mode 100644 index 000000000..80f5c419e --- /dev/null +++ b/relapack/test/lapack.h @@ -0,0 +1,64 @@ +#ifndef LAPACK_H2 +#define LAPACK_H2 + +#include "../config.h" + +void LAPACK(slauum)(const char *, const int *, float *, const int *, int *); +void LAPACK(dlauum)(const char *, const int *, double *, const int *, int *); +void LAPACK(clauum)(const char *, const int *, float *, const int *, int *); +void LAPACK(zlauum)(const char *, const int *, double *, const int *, int *); + +void LAPACK(strtri)(const char *, const char *, const int *, float *, const int *, int *); +void LAPACK(dtrtri)(const char *, const char *, const int *, double *, const int *, int *); +void LAPACK(ctrtri)(const char *, const char *, const int *, float *, const int *, int *); +void LAPACK(ztrtri)(const char *, const char *, const int *, double *, const int *, int *); + +void LAPACK(spotrf)(const char *, const int *, float *, const int *, int *); +void LAPACK(dpotrf)(const char *, const int *, double *, const int *, int *); +void LAPACK(cpotrf)(const char *, const int *, float *, const int *, int *); +void LAPACK(zpotrf)(const char *, const int *, double *, const int *, int *); + +void LAPACK(spbtrf)(const char *, const int *, const int *, float *, const int *, int *); +void LAPACK(dpbtrf)(const char *, const int *, const int *, double *, const int *, int *); +void LAPACK(cpbtrf)(const char *, const int *, const int *, float *, const int *, int *); +void LAPACK(zpbtrf)(const char *, const int *, const int *, double *, const int *, int *); + +void LAPACK(ssytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(dsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(csytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(chetrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(zsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(zhetrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(ssytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(dsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(csytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(chetrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(zsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(zhetrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); + +void LAPACK(sgetrf)(const int *, const int *, float *, const int *, int *, int *); +void LAPACK(dgetrf)(const int *, const int *, double *, const
int *, int *, int *); +void LAPACK(cgetrf)(const int *, const int *, float *, const int *, int *, int *); +void LAPACK(zgetrf)(const int *, const int *, double *, const int *, int *, int *); + +void LAPACK(sgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void LAPACK(dgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void LAPACK(cgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void LAPACK(zgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); + +void LAPACK(ssygst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void LAPACK(dsygst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void LAPACK(chegst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void LAPACK(zhegst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); + +void LAPACK(strsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void LAPACK(dtrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void LAPACK(ctrsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void LAPACK(ztrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); + +void LAPACK(stgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void LAPACK(dtgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void LAPACK(ctgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void LAPACK(ztgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); + +#endif /* LAPACK_H2 */ diff --git a/relapack/test/test.h b/relapack/test/test.h new file mode 100644 index 000000000..24089f3a8 --- /dev/null +++ b/relapack/test/test.h @@ -0,0 +1,136 @@ +#ifndef TEST_H +#define TEST_H + +#include "../config.h" +#include "config.h" + +#if BLAS_UNDERSCORE +#define BLAS(routine) routine ## _ +#else +#define BLAS(routine) routine +#endif + +#if LAPACK_UNDERSCORE +#define LAPACK(routine) routine ## _ +#else +#define LAPACK(routine) routine +#endif + +#include "../inc/relapack.h" +#include 
"lapack.h" +#include "util.h" +#include +#include +#include + +// some name mangling macros +#define CAT(A, B) A ## B +#define XCAT(A, B) CAT(A, B) +#define XLAPACK(X) LAPACK(X) +#define XRELAPACK(X) XCAT(RELAPACK_, X) +#define STR(X) #X +#define XSTR(X) STR(X) + +// default setup and error computation names: pre() and post() +#define PRE pre +#define POST post + +// TEST macro: +// run setup (pre()), ReLAPACK routine (i = 0), LAPACK routine (i = 1), compute +// error (post()), check error bound, and print setup and error +#define TEST(...) \ + PRE(); \ + i = 0; \ + XRELAPACK(ROUTINE)(__VA_ARGS__); \ + i = 1; \ + XLAPACK(ROUTINE)(__VA_ARGS__); \ + POST(); \ + fail |= error > ERR_BOUND; \ + printf("%s(%s)\t%g\n", XSTR(ROUTINE), #__VA_ARGS__, error); + +// generalized datatype treatment: DT_PREFIX determines the type s, d, c, or z +#define XPREF(A) XCAT(DT_PREFIX, A) + +// matrix generation and error computation routines +#define x2matgen XPREF(2matgen) +#define x2vecerr XPREF(2vecerr) + +// error bounds +#define ERR_BOUND XPREF(ERR_BOUND_) +#define sERR_BOUND_ SINGLE_ERR_BOUND +#define dERR_BOUND_ DOUBLE_ERR_BOUND +#define cERR_BOUND_ SINGLE_ERR_BOUND +#define zERR_BOUND_ DOUBLE_ERR_BOUND + +// C datatypes +#define datatype XPREF(datatype_) +#define sdatatype_ float +#define ddatatype_ double +#define cdatatype_ float +#define zdatatype_ double + +// number of C datatype elements per element +#define x1 XPREF(DT_MULT) +#define sDT_MULT 1 +#define dDT_MULT 1 +#define cDT_MULT 2 +#define zDT_MULT 2 + +// typed allocations +#define xmalloc XPREF(malloc) +#define imalloc(S) malloc((S) * sizeof(int)) +#define smalloc(S) malloc((S) * sizeof(float)) +#define dmalloc(S) malloc((S) * sizeof(double)) +#define cmalloc(S) malloc((S) * 2 * sizeof(float)) +#define zmalloc(S) malloc((S) * 2 * sizeof(double)) + +// transpositions +#define xCTRANS XPREF(CTRANS) +#define sCTRANS "T" +#define dCTRANS "T" +#define cCTRANS "C" +#define zCTRANS "C" + +// some constants +#define MONE XPREF(MONE) +const float sMONE[] = { -1. }; +const double dMONE[] = { -1. }; +const float cMONE[] = { -1., 0. }; +const double zMONE[] = { -1., 0. }; + +#define ZERO XPREF(ZERO) +const float sZERO[] = { 0. }; +const double dZERO[] = { 0. }; +const float cZERO[] = { 0., 0. }; +const double zZERO[] = { 0., 0. }; + +#define ONE XPREF(ONE) +const float sONE[] = { 1. }; +const double dONE[] = { 1. }; +const float cONE[] = { 1., 0. }; +const double zONE[] = { 1., 0. }; + +const int iMONE[] = { -1 }; +const int iZERO[] = { 0 }; +const int iONE[] = { 1 }; +const int iTWO[] = { 2 }; +const int iTHREE[] = { 3 }; +const int iFOUR[] = { 4 }; + +void tests(); + +// global variables (used in tests(), pre(), and post()) +int i, n, n2, fail; +double error; + +int main(int argc, char* argv[]) { + n = TEST_SIZE; + n2 = (3 * n) / 4; + fail = 0; + + tests(); + + return fail; +} + +#endif /* TEST_H */ diff --git a/relapack/test/util.c b/relapack/test/util.c new file mode 100644 index 000000000..e0fca3eec --- /dev/null +++ b/relapack/test/util.c @@ -0,0 +1,116 @@ +#include "util.h" +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +/////////////////////// +// matrix generation // +/////////////////////// +// Each routine x2matgen is passed the size (m, n) of the desired matrix and +// geneartes two copies of such a matrix in in its output arguments A and B. +// The generated matrices is filled with random entries in [0, 1[ (+i*[0, 1[ in +// the complex case). 
Then m is added to the diagonal; this is numerically +// favorable for routines working with triangular and symmetric matrices. For +// the same reason the imaginary part of the diagonal is set to 0. + +void s2matgen(const int m, const int n, float *A, float *B) { + srand(time(NULL) + (size_t) A); + int i, j; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) + A[i + m * j] = B[i + m * j] = (float) rand() / RAND_MAX + m * (i == j); +} + +void d2matgen(const int m, const int n, double *A, double *B) { + srand(time(NULL) + (size_t) A); + int i, j; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) + A[i + m * j] = B[i + m * j] = (double) rand() / RAND_MAX + m * (i == j); +} + +void c2matgen(const int m, const int n, float *A, float *B) { + srand(time(NULL) + (size_t) A); + int i, j; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) { + A[2 * (i + m * j)] = B[2 * (i + m * j)] = (float) rand() / RAND_MAX + m * (i == j); + A[2 * (i + m * j) + 1] = B[2 * (i + m * j) + 1] = ((float) rand() / RAND_MAX) * (i != j); + } +} + +void z2matgen(const int m, const int n, double *A, double *B) { + srand(time(NULL) + (size_t) A); + int i, j; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) { + A[2 * (i + m * j)] = B[2 * (i + m * j)] = (double) rand() / RAND_MAX + m * (i == j); + A[2 * (i + m * j) + 1] = B[2 * (i + m * j) + 1] = ((double) rand() / RAND_MAX) * (i != j); + } +} + +//////////////////////// +// error computations // +//////////////////////// +// Each routine x2vecerr is passed a vector length n and two vectors x and y. +// It returns the maximum of the element-wise error between these two vectors. +// This error is the minimum of the absolute difference and the relative +// difference with respect to y. + +double i2vecerr(const int n, const int *x, const int *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = abs(x[i] - y[i]); + double den = abs(y[i]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double s2vecerr(const int n, const float *x, const float *y) { + float error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = fabs((double) x[i] - y[i]); + double den = fabs(y[i]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double d2vecerr(const int n, const double *x, const double *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = fabs(x[i] - y[i]); + double den = fabs(y[i]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double c2vecerr(const int n, const float *x, const float *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = sqrt(((double) x[2 * i] - y[2 * i]) * ((double) x[2 * i] - y[2 * i]) + ((double) x[2 * i + 1] - y[2 * i + 1]) * ((double) x[2 * i + 1] - y[2 * i + 1])); + double den = sqrt((double) y[2 * i] * y[2 * i] + (double) y[2 * i + 1] * y[2 * i + 1]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double z2vecerr(const int n, const double *x, const double *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = sqrt((x[2 * i] - y[2 * i]) * (x[2 * i] - y[2 * i]) + (x[2 * i + 1] - y[2 * i + 1]) * (x[2 * i + 1] - y[2 * i + 1])); + double den = sqrt(y[2 * i] * y[2 * i] + y[2 * i + 1] * y[2 * i + 1]); + error = MAX(error, (den > 0) ?
MIN(nom, nom / den) : nom); + } + return error; +} diff --git a/relapack/test/util.h b/relapack/test/util.h new file mode 100644 index 000000000..11d2999e0 --- /dev/null +++ b/relapack/test/util.h @@ -0,0 +1,15 @@ +#ifndef TEST_UTIL_H +#define TEST_UTIL_H + +void s2matgen(int, int, float *, float *); +void d2matgen(int, int, double *, double *); +void c2matgen(int, int, float *, float *); +void z2matgen(int, int, double *, double *); + +double i2vecerr(int, const int *, const int *); +double s2vecerr(int, const float *, const float *); +double d2vecerr(int, const double *, const double *); +double c2vecerr(int, const float *, const float *); +double z2vecerr(int, const double *, const double *); + +#endif /* TEST_UTIL_H */ diff --git a/relapack/test/xgbtrf.c b/relapack/test/xgbtrf.c new file mode 100644 index 000000000..f255006a5 --- /dev/null +++ b/relapack/test/xgbtrf.c @@ -0,0 +1,43 @@ +#include "test.h" + +datatype *A[2]; +int *ipiv[2], info; +int kl, ku, ld; + +void pre() { + int i; + x2matgen(ld, n, A[0], A[1]); + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + ld * i)] = + A[1][x1 * (i + ld * i)] = (datatype) rand() / RAND_MAX; + } + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(ld * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + kl = n - 10; + ku = n; + ld = 2 * kl + ku + 1; + + A[0] = xmalloc(ld * n); + A[1] = xmalloc(ld * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + + #define ROUTINE XPREF(gbtrf) + + TEST(&n, &n, &kl, &ku, A[i], &ld, ipiv[i], &info); + TEST(&n, &n2, &kl, &ku, A[i], &ld, ipiv[i], &info); + TEST(&n2, &n, &kl, &ku, A[i], &ld, ipiv[i], &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); +} diff --git a/relapack/test/xgemmt.c b/relapack/test/xgemmt.c new file mode 100644 index 000000000..ffc37049d --- /dev/null +++ b/relapack/test/xgemmt.c @@ -0,0 +1,65 @@ +#include "test.h" + +datatype *A[2], *B[2], *C[2], *Ctmp; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); + x2matgen(n, n, C[0], C[1]); +} + +void post() { + error = x2vecerr(n * n, C[0], C[1]); +} + +#define ROUTINE XPREF(gemmt) + +#define xlacpy XPREF(LAPACK(lacpy)) +#define xgemm XPREF(BLAS(gemm)) + +extern void xlacpy(const char *, const int *, const int *, const datatype *, const int *, datatype *, const int *); +extern void xgemm(const char *, const char *, const int *, const int *, const int *, const datatype *, const datatype *, const int *, const datatype *, const int *, const datatype *, datatype *, const int*); + +void XLAPACK(ROUTINE)( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const datatype *alpha, const datatype *A, const int *ldA, + const datatype *B, const int *ldB, + const datatype *beta, datatype *C, const int *ldC +) { + xlacpy(uplo, n, n, C, ldC, Ctmp, n); + xgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, Ctmp, n); + // copy back: source Ctmp has leading dimension n, destination C has ldC + xlacpy(uplo, n, n, Ctmp, n, C, ldC); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + C[0] = xmalloc(n * n); + C[1] = xmalloc(n * n); + Ctmp = xmalloc(n * n); + + TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n); + TEST("L", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "T", "N", &n, &n,
ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n); + TEST("U", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "T", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); + free(C[0]); + free(C[1]); + free(Ctmp); +} diff --git a/relapack/test/xgetrf.c b/relapack/test/xgetrf.c new file mode 100644 index 000000000..4484a24af --- /dev/null +++ b/relapack/test/xgetrf.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2]; +int *ipiv[2], info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + + #define ROUTINE XPREF(getrf) + + TEST(&n, &n, A[i], &n, ipiv[i], &info); + TEST(&n, &n2, A[i], &n, ipiv[i], &info); + TEST(&n2, &n, A[i], &n, ipiv[i], &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); +} diff --git a/relapack/test/xhegst.c b/relapack/test/xhegst.c new file mode 100644 index 000000000..c318ef546 --- /dev/null +++ b/relapack/test/xhegst.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2], *B[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + + #define ROUTINE XPREF(hegst) + + TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); +} diff --git a/relapack/test/xhetrf.c b/relapack/test/xhetrf.c new file mode 100644 index 000000000..b5d54bdff --- /dev/null +++ b/relapack/test/xhetrf.c @@ -0,0 +1,40 @@ +#include "test.h" + +datatype *A[2], *Work; +int *ipiv[2], info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + const int lWork = n * n; + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + Work = xmalloc(lWork); + + #define ROUTINE XPREF(hetrf) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + #undef ROUTINE + #define ROUTINE XPREF(hetrf_rook) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); + free(Work); +} diff --git a/relapack/test/xlauum.c b/relapack/test/xlauum.c new file mode 100644 index 000000000..d2c42fa01 --- /dev/null +++ b/relapack/test/xlauum.c @@ -0,0 +1,25 @@ +#include "test.h" + +datatype *A[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], 
A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(lauum) + + TEST("L", &n, A[i], &n, &info); + TEST("U", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xpbtrf.c b/relapack/test/xpbtrf.c new file mode 100644 index 000000000..9a9babb6b --- /dev/null +++ b/relapack/test/xpbtrf.c @@ -0,0 +1,40 @@ +#include "test.h" + +datatype *A[2]; +int info[2]; +int n; + +void pre() { + int i; + x2matgen(n, n, A[0], A[1]); + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + n * i)] = + A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + // set first row + A[0][x1 * (n * i)] = + A[1][x1 * (n * i)] = (datatype) rand() / RAND_MAX + n; + } +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(pbtrf) + + const int + kd1 = n / 4, + kd2 = n * 3 / 4; + TEST("L", &n, &kd1, A[i], &n, &info[i]); + TEST("L", &n, &kd2, A[i], &n, &info[i]); + TEST("U", &n, &kd1, A[i] - x1 * kd1, &n, &info[i]); + TEST("U", &n, &kd2, A[i] - x1 * kd2, &n, &info[i]); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xpotrf.c b/relapack/test/xpotrf.c new file mode 100644 index 000000000..5e04d426f --- /dev/null +++ b/relapack/test/xpotrf.c @@ -0,0 +1,25 @@ +#include "test.h" + +datatype *A[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(potrf) + + TEST("L", &n, A[i], &n, &info); + TEST("U", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xsygst.c b/relapack/test/xsygst.c new file mode 100644 index 000000000..b473a5919 --- /dev/null +++ b/relapack/test/xsygst.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2], *B[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + + #define ROUTINE XPREF(sygst) + + TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); +} diff --git a/relapack/test/xsytrf.c b/relapack/test/xsytrf.c new file mode 100644 index 000000000..82d626f6f --- /dev/null +++ b/relapack/test/xsytrf.c @@ -0,0 +1,40 @@ +#include "test.h" + +datatype *A[2], *Work; +int *ipiv[2], info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + const int lWork = n * n; + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + Work = xmalloc(lWork); + + #define ROUTINE XPREF(sytrf) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + #undef ROUTINE + #define ROUTINE XPREF(sytrf_rook) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); + free(Work); +} diff --git a/relapack/test/xtgsyl.c 
b/relapack/test/xtgsyl.c new file mode 100644 index 000000000..74db5005e --- /dev/null +++ b/relapack/test/xtgsyl.c @@ -0,0 +1,94 @@ +#include "test.h" + +datatype *A[2], *B[2], *C[2], *D[2], *E[2], *F[2], *Work, scale[2], dif[2]; +int *iWork, lWork, info; + +#define xlascl XPREF(LAPACK(lascl)) +void xlascl(const char *, const int *, const int *, const datatype *, const + datatype *, const int *, const int *, datatype *, const int *, int *); + +#define xscal XPREF(LAPACK(scal)) +void xscal(const int *, const datatype *, datatype *, const int *); + +void pre() { + int i; + + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); + x2matgen(n, n, C[0], C[1]); + x2matgen(n, n, D[0], D[1]); + x2matgen(n, n, E[0], E[1]); + x2matgen(n, n, F[0], F[1]); + + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + n * i)] = + A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + E[0][x1 * (i + n * i)] = + E[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + // clear first subdiagonal + A[0][x1 * (i + 1 + n * i)] = + A[1][x1 * (i + 1 + n * i)] = + B[0][x1 * (i + 1 + n * i)] = + B[1][x1 * (i + 1 + n * i)] = + A[0][x1 * (i + 1 + n * i) + x1 - 1] = + A[1][x1 * (i + 1 + n * i) + x1 - 1] = + B[0][x1 * (i + 1 + n * i) + x1 - 1] = + B[1][x1 * (i + 1 + n * i) + x1 - 1] = 0; + } +} + + +void post() { + if (scale[0] != 1 || scale[0] != 1) + printf("scale[RELAPACK] = %12g\tscale[LAPACK] = %12g\n", scale[0], scale[1]); + if (scale[0]) { + xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, C[0], &n, &info); + xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, F[0], &n, &info); + } + error = x2vecerr(n * n, C[0], C[1]) + x2vecerr(n * n, F[0], F[1]); +} + +void tests() { + lWork = 2 * n * n; + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + C[0] = xmalloc(n * n); + C[1] = xmalloc(n * n); + D[0] = xmalloc(n * n); + D[1] = xmalloc(n * n); + E[0] = xmalloc(n * n); + E[1] = xmalloc(n * n); + F[0] = xmalloc(n * n); + F[1] = xmalloc(n * n); + Work = xmalloc(lWork); + iWork = imalloc(n + n + 2); + + #define ROUTINE XPREF(tgsyl) + + TEST("N", iZERO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iZERO, &n2, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iZERO, &n, &n2, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iTWO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iTHREE, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iFOUR, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST(xCTRANS, iZERO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); + free(C[0]); + free(C[1]); + free(D[0]); + free(D[1]); + free(E[0]); + free(E[1]); + free(F[0]); + free(F[1]); + free(Work); + free(iWork); +} diff --git a/relapack/test/xtrsyl.c b/relapack/test/xtrsyl.c new file mode 100644 index 000000000..358a89242 --- /dev/null +++ 
b/relapack/test/xtrsyl.c @@ -0,0 +1,65 @@ +#include "test.h" + +datatype *A[2], *B[2], *C[2], *Work, scale[2]; +int info; + +#define xlascl XPREF(LAPACK(lascl)) +void xlascl(const char *, const int *, const int *, const datatype *, const + datatype *, const int *, const int *, datatype *, const int *, int *); + +void pre() { + int i; + + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); + x2matgen(n, n, C[0], C[1]); + + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + n * i)] = + A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + // clear first subdiagonal + A[0][x1 * (i + 1 + n * i)] = + A[1][x1 * (i + 1 + n * i)] = + B[0][x1 * (i + 1 + n * i)] = + B[1][x1 * (i + 1 + n * i)] = + A[0][x1 * (i + 1 + n * i) + x1 - 1] = + A[1][x1 * (i + 1 + n * i) + x1 - 1] = + B[0][x1 * (i + 1 + n * i) + x1 - 1] = + B[1][x1 * (i + 1 + n * i) + x1 - 1] = 0; + } +} + +void post() { + if (scale[0] != 1 || scale[1] != 1) + printf("scale[RELAPACK] = %12g\tscale[LAPACK] = %12g\n", scale[0], scale[1]); + if (scale[0]) + xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, C[0], &n, &info); + error = x2vecerr(n * n, C[0], C[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + C[0] = xmalloc(n * n); + C[1] = xmalloc(n * n); + + #define ROUTINE XPREF(trsyl) + + TEST("N", "N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("N", "N", iONE, &n2, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("N", "N", iONE, &n, &n2, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("C", "N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("N", "C", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("C", "C", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("N", "N", iMONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); + free(C[0]); + free(C[1]); +} diff --git a/relapack/test/xtrtri.c b/relapack/test/xtrtri.c new file mode 100644 index 000000000..106391bc8 --- /dev/null +++ b/relapack/test/xtrtri.c @@ -0,0 +1,25 @@ +#include "test.h" + +datatype *A[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(trtri) + + TEST("L", "N", &n, A[i], &n, &info); + TEST("U", "N", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} From 912410f2140da59c0b639110fd6ac6ed495e8bec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Jun 2017 18:15:21 +0200 Subject: [PATCH 13/42] Add ReLAPACK to Makefiles --- Makefile | 20 ++++++++++++++------ Makefile.rule | 5 ++++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 27923aa9b..499732d16 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,11 @@ ifneq ($(NO_LAPACK), 1) SUBDIRS += lapack endif +RELA = +ifneq ($(BUILD_RELAPACK), 0) +RELA = re_lapack +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench .PHONY : all libs netlib test ctest shared install .NOTPARALLEL : all libs prof lapack-test install blas-test -all :: libs netlib tests shared +all :: libs netlib $(RELA) tests shared @echo @echo " OpenBLAS build complete.
($(LIB_COMPONENTS))" @echo @@ -215,6 +220,14 @@ ifndef NO_LAPACKE endif endif +ifeq ($(NO_LAPACK), 1) +re_lapack : + +else +re_lapack : + @$(MAKE) -C relapack +endif + prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof @@ -329,8 +342,3 @@ endif @rm -f *.grd Makefile.conf_last config_last.h @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) @echo Done. - -# Makefile debugging trick: -# call print-VARIABLE to see the runtime value of any variable -print-%: - @echo '$*=$($*)' diff --git a/Makefile.rule b/Makefile.rule index b6c22f798..2866699be 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -83,6 +83,9 @@ VERSION = 0.2.20.dev # Build LAPACK Deprecated functions since LAPACK 3.6.0 BUILD_LAPACK_DEPRECATED = 1 +# Build RecursiveLAPACK on top of LAPACK +BUILD_RELAPACK = 1 + # If you want to use legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1 NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. -NO_AFFINITY = 1 +#NO_AFFINITY = 1 # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 From 0d5c8e53861adbb54ee9952ed76c2e8dd029ca96 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 12:43:13 +0530 Subject: [PATCH 14/42] arm: Determine the abi from compiler if not specified on command line If ARM abi is not explicitly mentioned on the command line, then set the arm abi to softfp or hard according to the compiler environment. This assumes that compiler sets the defines __ARM_PCS and __ARM_PCS_VFP accordingly. --- Makefile.arm | 23 +++-------------------- Makefile.system | 24 ++++++++++++++++-------- c_check | 16 +++++++++++++++- common_arm.h | 5 ----- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index c189b0c47..eedd39b73 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,5 +1,4 @@ -#ifeq logical or -ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) CCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon -march=armv7-a @@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a endif endif -ifeq ($(CORE), ARMV7) -ifeq ($(OSNAME), Android) -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a -else -CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -endif -else -CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -endif -endif - ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp -march=armv6 endif - ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -march=armv5 -FCOMMON_OPT += -marm -march=armv5 +CCOMMON_OPT += -march=armv5 +FCOMMON_OPT += -march=armv5 endif diff --git a/Makefile.system b/Makefile.system index 29d3efd53..2cae5f1c9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -242,6 +242,10 @@ EXTRALIB += -lm NO_EXPRECISION = 1 endif +ifeq ($(OSNAME), Android) +EXTRALIB += -lm +endif + ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif @@ -483,16 +487,20 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 -CCOMMON_OPT += -marm -FCOMMON_OPT += -marm - -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI -FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI +# If ABI is specified on command line use it. 
Else use the automatically detected ABI. +ifeq ($(ARM_SOFTFP_ABI),1) +ARM_ABI = softfp else -CCOMMON_OPT += -mfloat-abi=hard -FCOMMON_OPT += -mfloat-abi=hard +ifeq ($(ARM_HARD_ABI),1) +ARM_ABI = hard +else +ARM_ABI=$(ARM_ABI_AUTO) endif +endif +export ARM_ABI_AUTO +CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) +FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) + endif ifeq ($(ARCH), arm64) diff --git a/c_check b/c_check index 20da288be..2e7e08cfb 100644 --- a/c_check +++ b/c_check @@ -94,7 +94,17 @@ if ($architecture eq "mips64") { $defined = 1; } -if (($architecture eq "arm") || ($architecture eq "arm64")) { +if ($architecture eq "arm") { + $defined = 1; + $data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`; + if ($data ne "") { + $abi = "hard"; + } else { + $abi = "softfp"; + } +} + +if ($architecture eq "arm64") { $defined = 1; } @@ -287,6 +297,10 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; +if ($architecture eq "arm") { + print MAKEFILE "ARM_ABI_AUTO=$abi\n"; +} + $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; $compiler =~ tr/[a-z]/[A-Z]/; diff --git a/common_arm.h b/common_arm.h index a17acb448..27fa76b76 100644 --- a/common_arm.h +++ b/common_arm.h @@ -111,11 +111,6 @@ REALNAME: #define PROFCODE -#ifdef __ARM_PCS -//-mfloat-abi=softfp -#define SOFT_FLOAT_ABI -#endif - #endif From da7f0ff425a4ccd9a4f32b7fff33b9ef807ad0f4 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 12:46:18 +0530 Subject: [PATCH 15/42] generic: add some generic gemm and trmm kernels Added generic 4x4 and 4x2 gemm kernels Added generic 4x2 trmm kernel --- kernel/generic/gemmkernel_4x2.c | 317 ++++++++++++++++++ kernel/generic/gemmkernel_4x4.c | 571 ++++++++++++++++++++++++++++++++ kernel/generic/trmmkernel_4x2.c | 528 +++++++++++++++++++++++++++++ 3 files changed, 1416 insertions(+) create mode 100644 kernel/generic/gemmkernel_4x2.c create mode 100644 kernel/generic/gemmkernel_4x4.c create mode 100644 kernel/generic/trmmkernel_4x2.c diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c new file mode 100644 index 000000000..1d15de1d7 --- /dev/null +++ b/kernel/generic/gemmkernel_4x2.c @@ -0,0 +1,317 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + + for (j=0; j<(bn/2); j+=2) + { + C0 = C; + C1 = C0+ldc; + + ptrba = ba; + + for (i=0; i + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + + for (j=0; j + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + for (j=0; j<(bn/2); j+=2) // do the Mx2 loops + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + + ptrba = ba; + + for (i=0; i Date: Fri, 30 Jun 2017 13:06:38 +0530 Subject: [PATCH 16/42] arm: Use assembly implementations based on the ARM abi In case of the softfp ABI, assembly implementations are used only for those APIs that do not have a floating point argument or return value. In case of the hard ABI, all assembly implementations are used.
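For context on why the ABI matters here: under the hard-float ABI (for which the compiler defines `__ARM_PCS_VFP`), floating-point results are returned in VFP registers (s0/d0), while under softfp the caller expects them in core registers (r0, or the r0:r1 pair for doubles). The epilogue pattern that the following patches add to the affected kernels is roughly this sketch (mirroring the asum/nrm2/dot hunks below; `DOUBLE` is OpenBLAS's usual precision define):

    /* end of a kernel that returns a floating-point value */
    #if !defined(__ARM_PCS_VFP)   /* softfp: move the result into core registers */
    #if !defined(DOUBLE)
        vmov r0, s0               /* single-precision result -> r0 */
    #else
        vmov r0, r1, d0           /* double-precision result -> r0:r1 */
    #endif
    #endif                        /* hard abi: result is already in s0/d0 */
        bx lr

APIs without floating-point arguments or return values are unaffected by this distinction, which is why their kernels can remain outside the `ifeq ($(ARM_ABI),hard)` guard in the KERNEL files changed below.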
--- kernel/arm/KERNEL.ARMV6 | 112 +++++++++++++++-------------------- kernel/arm/KERNEL.ARMV7 | 125 +++++----------------------------------- 2 files changed, 60 insertions(+), 177 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 16bde105b..a2dd4806d 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,7 +1,5 @@ +include $(KERNELDIR)/KERNEL.ARMV5 - - -############################################################################### SAMAXKERNEL = iamax_vfp.S DAMAXKERNEL = iamax_vfp.S CAMAXKERNEL = iamax_vfp.S @@ -34,6 +32,45 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S +SGEMMKERNEL = ../generic/gemmkernel_4x2.c +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = sgemm_ncopy_2_vfp.S +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_4x2.c +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif +DGEMMONCOPY = dgemm_ncopy_2_vfp.S +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +STRMMKERNEL = ../generic/trmmkernel_4x2.c +DTRMMKERNEL = ../generic/trmmkernel_4x2.c + +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +ifeq ($(ARM_ABI),hard) + SASUMKERNEL = asum_vfp.S DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S @@ -44,11 +81,6 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c - SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S @@ -64,11 +96,6 @@ DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S @@ -84,63 +111,14 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S -ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S -SGEMMINCOPY = sgemm_ncopy_4_vfp.S -SGEMMITCOPY = sgemm_tcopy_4_vfp.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPY = sgemm_ncopy_2_vfp.S -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - DGEMMKERNEL = dgemm_kernel_4x2_vfp.S -DGEMMINCOPY = dgemm_ncopy_4_vfp.S -DGEMMITCOPY = dgemm_tcopy_4_vfp.S -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPY = dgemm_ncopy_2_vfp.S -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - CGEMMKERNEL = cgemm_kernel_2x2_vfp.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S 
-CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - +endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d5cd94fbd..d4829faa3 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,86 +1,29 @@ +include $(KERNELDIR)/KERNEL.ARMV6 -################################################################################# -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_4x4.c -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S +SGEMMKERNEL = ../generic/gemmkernel_4x4.c +SGEMMONCOPY = sgemm_ncopy_4_vfp.S +SGEMMOTCOPY = sgemm_tcopy_4_vfp.S +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S +DGEMMKERNEL = ../generic/gemmkernel_4x4.c +DGEMMONCOPY = dgemm_ncopy_4_vfp.S +DGEMMOTCOPY = dgemm_tcopy_4_vfp.S +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S - -ISAMAXKERNEL = iamax_vfp.S -IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S - -ISAMINKERNEL = iamax_vfp.S -IDAMINKERNEL = iamax_vfp.S -ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S - -ISMAXKERNEL = iamax_vfp.S -IDMAXKERNEL = iamax_vfp.S - -ISMINKERNEL = iamax_vfp.S -IDMINKERNEL = iamax_vfp.S - -SSWAPKERNEL = swap_vfp.S -DSWAPKERNEL = swap_vfp.S -CSWAPKERNEL = swap_vfp.S -ZSWAPKERNEL = swap_vfp.S - -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S - -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c - -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S +ifeq ($(ARM_ABI),hard) SNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S @@ 
-88,47 +31,9 @@ CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S -SGEMMONCOPY = sgemm_ncopy_4_vfp.S -SGEMMOTCOPY = sgemm_tcopy_4_vfp.S -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMONCOPY = dgemm_ncopy_4_vfp.S -DGEMMOTCOPY = dgemm_tcopy_4_vfp.S -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - +endif From aa5edebc80200a590362dc2229e2751d399c04aa Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 13:12:05 +0530 Subject: [PATCH 17/42] arm: add softfp support in kernel/arm/asum_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/asum_vfp.S | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index a2dd4806d..1da6e4e1f 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -32,6 +32,11 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -71,11 +76,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index fe6242a5b..5b08e5028 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -475,6 +475,14 @@ asum_kernel_L999: vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif +#endif + bx lr EPILOGUE From 4f0773f07d07b9adad103e66d7b3abae108d9d31 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 20:06:29 +0530 Subject: [PATCH 18/42] arm: add softfp support in kernel/arm/axpy_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++--- kernel/arm/axpy_vfp.S | 71 ++++++++++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1da6e4e1f..63867d2b7 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -37,6 +37,11 @@ DASUMKERNEL = asum_vfp.S 
CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -76,11 +81,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S - SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 8e5334f62..a407b04bd 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_INC_X [fp, #0 ] -#define OLD_Y [fp, #4 ] -#define OLD_INC_Y [fp, #8 ] -#else +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 #define OLD_X [fp, #0 ] #define OLD_INC_X [fp, #4 ] #define OLD_Y [fp, #8 ] #define OLD_INC_Y [fp, #12 ] +#else +#define OLD_ALPHA [fp, #0] +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define OLD_Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] #endif - + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_ALPHAR [fp, #0] +#define OLD_ALPHAI [fp, #8] +#define OLD_X [fp, #16 ] +#define OLD_INC_X [fp, #20 ] +#define OLD_Y [fp, #24 ] +#define OLD_INC_Y [fp, #28 ] +#endif + +#endif //!defined(COMPLEX) + +#else //__ARM_PCS_VFP + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + +#endif //!defined(__ARM_PCS_VFP) + #define N r0 #define Y r1 #define INC_X r2 @@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #8 sub sp, sp, #STACKSIZE // reserve stack -#ifdef ARM_SOFTFP_ABI -#ifndef DOUBLE - vmov s0, r3 //move alpha to s0 +#if !defined(__ARM_PCS_VFP) +#if !defined(COMPLEX) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA + ldr X, OLD_X +#else + vldr d0, OLD_ALPHA ldr X, OLD_X #endif +#else //COMPLEX +#if !defined(DOUBLE) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr X, OLD_X +#else + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr X, OLD_X #endif - +#endif +#endif + ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y From 0150fabdb6250748bc45d18ccbb782331526c5cd Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 21:52:32 +0530 Subject: [PATCH 19/42] arm: add softfp support in kernel/arm/rot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/rot_vfp.S | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 63867d2b7..e9fe6bedd 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -42,6 +42,11 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -91,11 +96,6 @@ DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index d053423b6..6e679ecf9 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_INC_Y [fp, #0 ] +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) +#define OLD_C [fp, #4] +#define OLD_S [fp, #8] +#else +#define OLD_C [fp, #8] +#define OLD_S [fp, #16] +#endif +#endif #define N r0 #define X r1 @@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #8 ldr INC_Y , OLD_INC_Y - +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vldr s0, OLD_C + vldr s1, OLD_S +#else + vldr d0, OLD_C + vldr d1, OLD_S +#endif +#endif cmp N, #0 ble rot_kernel_L999 From 54915ce343dba7e00dba21c73b9fd35bcded0de3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 23:46:02 +0530 Subject: [PATCH 20/42] arm: add softfp support in kernel/arm/*dot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/cdot_vfp.S | 32 ++++++++++++++++++++++++++------ kernel/arm/ddot_vfp.S | 3 +++ kernel/arm/sdot_vfp.S | 13 ++++++------- kernel/arm/zdot_vfp.S | 32 ++++++++++++++++++++++++++------ 5 files changed, 66 insertions(+), 24 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e9fe6bedd..022547c9b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -47,6 +47,11 @@ DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S +SDOTKERNEL = sdot_vfp.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -86,11 +91,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S - SNRM2KERNEL = nrm2_vfp.S DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index 0497b6d83..e5a6e4d35 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmov s2, s0 vmov s3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y +#endif cmp N, #0 ble cdot_kernel_L999 @@ -265,7 +283,6 @@ cdot_kernel_S10: cdot_kernel_L999: - sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers @@ -276,8 +293,11 @@ cdot_kernel_L999: vadd.f32 s0 , s0, s2 vsub.f32 s1 , s1, s3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {s0 - s1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index f28acbae3..fb294d8b4 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -246,6 +246,9 @@ ddot_kernel_L999: vldm r3, { d8 - d15} // restore floating point registers vadd.f64 d0 , d0, d1 // set return value +#if !defined(__ARM_PCS_VFP) + vmov r0, r1, d0 +#endif sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index f3abdc197..5f4f424bf 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -329,20 +329,19 @@ sdot_kernel_L999: vldm r3, { s8 - s15} // restore floating point registers #if defined(DSDOT) - vadd.f64 d0 , d0, d1 // set return value - -#ifdef ARM_SOFTFP_ABI - vmov r0, r1, d0 +#else + vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if defined(DSDOT) + vmov r0, r1, d0 #else - - vadd.f32 s0 , s0, s1 // set return value -#ifdef ARM_SOFTFP_ABI vmov r0, s0 #endif #endif + sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 936ce9f60..43f2c0c0b 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vcvt.f64.f32 d2, s0 vcvt.f64.f32 d3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - +#endif cmp N, #0 ble zdot_kernel_L999 @@ -280,8 +297,11 @@ zdot_kernel_L999: vadd.f64 d0 , d0, d2 vsub.f64 d1 , d1, d3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {d0 - d1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr From e25f4c01d60b76ab97252e475abfb8fa7e65c0f9 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 19:57:28 +0530 Subject: [PATCH 21/42] arm: add softfp support in kernel/arm/nrm2_vfp*.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/KERNEL.ARMV7 | 10 +++++----- kernel/arm/nrm2_vfp.S | 7 +++++++ kernel/arm/nrm2_vfpv3.S | 9 +++++++-- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 022547c9b..be51e83b8 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -52,6 +52,11 @@ DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S +SNRM2KERNEL = nrm2_vfp.S +DNRM2KERNEL = nrm2_vfp.S +CNRM2KERNEL = nrm2_vfp.S +ZNRM2KERNEL = nrm2_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -91,11 +96,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SNRM2KERNEL = nrm2_vfp.S -DNRM2KERNEL = nrm2_vfp.S -CNRM2KERNEL = nrm2_vfp.S -ZNRM2KERNEL = nrm2_vfp.S - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d4829faa3..f4823b70a 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,5 +1,10 @@ include $(KERNELDIR)/KERNEL.ARMV6 +SNRM2KERNEL = nrm2_vfpv3.S +DNRM2KERNEL = nrm2_vfpv3.S +CNRM2KERNEL = nrm2_vfpv3.S +ZNRM2KERNEL = nrm2_vfpv3.S + STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c @@ -17,11 +22,6 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SNRM2KERNEL = nrm2_vfpv3.S -DNRM2KERNEL = nrm2_vfpv3.S -CNRM2KERNEL = nrm2_vfpv3.S -ZNRM2KERNEL = nrm2_vfpv3.S - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index b3bd28152..16ac5a632 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -573,6 +573,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 +#endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif #endif bx lr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 7af966895..84977901d 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -503,8 +503,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 -#ifdef ARM_SOFTFP_ABI - vmov r0, s0 +#endif + +#if !defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else + vmov r0, s0 #endif #endif From 83bd547517e0a7df5b60ae1e6165c9d3528a07e4 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 20:37:40 +0530 Subject: [PATCH 22/42] arm: add softfp support in kernel/arm/swap_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/swap_vfp.S | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index be51e83b8..86d3dabaa 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ 
-57,6 +57,11 @@ DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S
+SSWAPKERNEL = swap_vfp.S
+DSWAPKERNEL = swap_vfp.S
+CSWAPKERNEL = swap_vfp.S
+ZSWAPKERNEL = swap_vfp.S
+
SGEMMKERNEL = ../generic/gemmkernel_4x2.c
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
@@ -96,11 +101,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ifeq ($(ARM_ABI),hard)
-SSWAPKERNEL = swap_vfp.S
-DSWAPKERNEL = swap_vfp.S
-CSWAPKERNEL = swap_vfp.S
-ZSWAPKERNEL = swap_vfp.S
-
SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
CGEMVNKERNEL = cgemv_n_vfp.S
diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S
index 352875188..76661da79 100644
--- a/kernel/arm/swap_vfp.S
+++ b/kernel/arm/swap_vfp.S
@@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(COMPLEX)
+
+#if !defined(DOUBLE)
+#define OLD_X [fp, #0 ]
+#define OLD_INC_X [fp, #4 ]
+#define OLD_Y [fp, #8 ]
+#define OLD_INC_Y [fp, #12 ]
+#else
+#define OLD_X [fp, #8 ]
+#define OLD_INC_X [fp, #12]
+#define OLD_Y [fp, #16]
+#define OLD_INC_Y [fp, #20]
+#endif
+
+#else //COMPLEX
+
+#if !defined(DOUBLE)
+#define OLD_X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define OLD_Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#else
+#define OLD_X [fp, #16]
+#define OLD_INC_X [fp, #20]
+#define OLD_Y [fp, #24]
+#define OLD_INC_Y [fp, #28]
+#endif
+
+#endif // !defined(COMPLEX)
+
+#else
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
+#endif
#define N r0
@@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
push {r4 , fp}
add fp, sp, #8
+#if !defined(__ARM_PCS_VFP)
+	ldr X, OLD_X
+#endif
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y
From ebf9e9dabe0c4f8208e8dd5c8a6579fa0045450e Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Sat, 1 Jul 2017 11:16:12 -0700
Subject: [PATCH 23/42] arm64: Change mtune/mcpu options for THUNDERX2T99 target
---
Makefile.arm64 | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Makefile.arm64 b/Makefile.arm64
index 7e9df2f4b..d19e796a5 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
endif
ifeq ($(CORE), THUNDERX2T99)
-CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
-FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
+CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
+FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
endif
From 8f83d3f961f57fb002d8c5359c32a8db50dcab5d Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Sun, 2 Jul 2017 00:38:44 +0530
Subject: [PATCH 24/42] arm: add softfp support in vfp gemv kernels
---
kernel/arm/KERNEL.ARMV6 | 20 +++++++-------
kernel/arm/KERNEL.ARMV7 | 6 ++--
kernel/arm/cgemv_n_vfp.S | 28 +++++++++++++++----
kernel/arm/cgemv_t_vfp.S | 28 +++++++++++++++----
kernel/arm/gemv_n_vfp.S | 44 +++++++++++++++++++++++++----
kernel/arm/gemv_n_vfpv3.S | 58 +++++++++++++++++++++++----------------
kernel/arm/gemv_t_vfp.S | 54 ++++++++++++++++++++++--------------
kernel/arm/gemv_t_vfpv3.S | 44 +++++++++++++++++++++++++----
kernel/arm/zgemv_n_vfp.S | 28 +++++++++++++++----
kernel/arm/zgemv_t_vfp.S | 28 +++++++++++++++----
10 files changed, 252 insertions(+), 86 deletions(-)
diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6
index 86d3dabaa..022a93183 100644
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@@ -62,6 +62,16 @@ DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL =
swap_vfp.S +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -101,16 +111,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SGEMVNKERNEL = gemv_n_vfp.S -DGEMVNKERNEL = gemv_n_vfp.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S - STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index f4823b70a..0872cb8cd 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -5,6 +5,9 @@ DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S +SGEMVNKERNEL = gemv_n_vfpv3.S +DGEMVNKERNEL = gemv_n_vfpv3.S + STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c @@ -22,9 +25,6 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SGEMVNKERNEL = gemv_n_vfpv3.S -DGEMVNKERNEL = gemv_n_vfpv3.S - STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 5d2748644..4a1cd2d45 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble cgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr s0 , ALPHA_R diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 76c8a8f18..e1c750c85 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
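// Sketch of the assumption behind the softfp block that follows: with
// floating-point arguments passed in core registers, r3 now carries alpha_r
// rather than the matrix pointer, so OLD_A must be reloaded from its stack
// slot (OLD_A_SOFTFP) once alpha_r/alpha_i have been moved into s0/s1.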
cmp OLD_N, #0 ble cgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 385370b7f..7c154d741 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_M r0 @@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble gemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 93bf23e49..54f958b7b 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_M r0 #define AO1 r0 @@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble gemvn_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A str OLD_M, M - - + ldr INC_X , OLD_INC_X ldr INC_Y , OLD_INC_Y diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 816be54ff..9559d1829 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
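// The offsets defined below encode one assumed base-AAPCS detail: a float
// alpha still fits in r3, but a double alpha must occupy an 8-byte aligned
// stack slot, so it claims the first eight stack bytes and pushes A, LDA, X,
// INC_X, Y and INC_Y eight bytes further down than in the single case.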
#define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard abi -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_N r1 #define M r0 @@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble gemvt_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index 7ae5799bc..b1d3dadf1 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_N r1 @@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble gemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043..7d5567849 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
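// Assumed register rule behind the offsets below: in the base AAPCS a double
// must start in an even-numbered register, and with r0-r2 already holding
// integer arguments r3 cannot take half of alpha_r, so r3 goes unused and
// both components of the complex alpha open the stack block at [fp, #0] and
// [fp, #8].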
#define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble zgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr d0 , ALPHA_R diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa0701..407026166 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp OLD_N, #0
ble zgemvt_kernel_L999
+#if !defined(__ARM_PCS_VFP)
+	vldr d0, OLD_ALPHAR
+	vldr d1, OLD_ALPHAI
+	ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_A, A
str OLD_N, N
From eda9e8632ab7d94d609006612a4b760214dfa847 Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Sun, 2 Jul 2017 02:00:48 +0530
Subject: [PATCH 25/42] generic: Bug fixes in generic 4x2 and 4x4 gemm kernels
---
kernel/generic/gemmkernel_4x2.c | 30 +++++-----
kernel/generic/gemmkernel_4x4.c | 98 ++++++++++++++++-----------------
2 files changed, 64 insertions(+), 64 deletions(-)
diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c
index 1d15de1d7..8c784e2f1 100644
--- a/kernel/generic/gemmkernel_4x2.c
+++ b/kernel/generic/gemmkernel_4x2.c
@@ -154,11 +154,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res1_0 *= alpha;
	res1_1 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
-	C1[0] = res1_0;
-	C1[1] = res1_1;
+	C1[0] += res1_0;
+	C1[1] += res1_1;
	C0 = C0+2;
	C1 = C1+2;
@@ -190,12 +190,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res1_0 *= alpha;
-	C0[0] = res0_0;
+	C0[0] += res0_0;
-	C1[0] = res1_0;
+	C1[0] += res1_0;
	C0 = C0+1;
	C1 = C1+1;
}
@@ -245,10 +245,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res0_2 *= alpha;
	res0_3 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
-	C0[2] = res0_2;
-	C0[3] = res0_3;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
+	C0[2] += res0_2;
+	C0[3] += res0_3;
	C0 = C0+4;
@@ -278,8 +278,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res0_0 *= alpha;
	res0_1 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
	C0 = C0+2;
@@ -306,7 +306,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
-	C0[0] = res0_0;
+	C0[0] += res0_0;
	C0 = C0+1;
}
k = (bk<<0);
diff --git a/kernel/generic/gemmkernel_4x4.c b/kernel/generic/gemmkernel_4x4.c
index bd67b3fc8..99bd9c1ef 100644
--- a/kernel/generic/gemmkernel_4x4.c
+++ b/kernel/generic/gemmkernel_4x4.c
@@ -152,25 +152,25 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res3_2 *= alpha;
	res3_3 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
-	C0[2] = res0_2;
-	C0[3] = res0_3;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
+	C0[2] += res0_2;
+	C0[3] += res0_3;
-	C1[0] = res1_0;
-	C1[1] = res1_1;
-	C1[2] = res1_2;
-	C1[3] = res1_3;
+	C1[0] += res1_0;
+	C1[1] += res1_1;
+	C1[2] += res1_2;
+	C1[3] += res1_3;
-	C2[0] = res2_0;
-	C2[1] = res2_1;
-	C2[2] = res2_2;
-	C2[3] = res2_3;
+	C2[0] += res2_0;
+	C2[1] += res2_1;
+	C2[2] += res2_2;
+	C2[3] += res2_3;
-	C3[0] = res3_0;
-	C3[1] = res3_1;
-	C3[2] = res3_2;
-	C3[3] = res3_3;
+	C3[0] += res3_0;
+	C3[1] += res3_1;
+	C3[2] += res3_2;
+	C3[3] += res3_3;
	C0 = C0+4;
	C1 = C1+4;
@@ -230,17 +230,17 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res3_0 *= alpha;
	res3_1 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
-	C1[0] = res1_0;
-	C1[1] = res1_1;
+	C1[0] += res1_0;
+	C1[1] += res1_1;
-	C2[0] = res2_0;
-	C2[1] = res2_1;
+	C2[0] += res2_0;
+	C2[1] += res2_1;
-	C3[0] = res3_0;
-	C3[1] = res3_1;
+	C3[0] += res3_0;
+	C3[1] += res3_1;
	C0 = C0+2;
	C1 = C1+2;
@@ -283,13 +283,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res3_0 *= alpha;
-	C0[0] = res0_0;
+	C0[0] += res0_0;
-	C1[0] = res1_0;
+	C1[0]
+= res1_0; - C2[0] = res2_0; + C2[0] += res2_0; - C3[0] = res3_0; + C3[0] += res3_0; C0 = C0+1; C1 = C1+1; @@ -360,15 +360,15 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_2 *= alpha; res1_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; - C1[0] = res1_0; - C1[1] = res1_1; - C1[2] = res1_2; - C1[3] = res1_3; + C1[0] += res1_0; + C1[1] += res1_1; + C1[2] += res1_2; + C1[3] += res1_3; C0 = C0+4; C1 = C1+4; @@ -408,11 +408,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; res1_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; - C1[0] = res1_0; - C1[1] = res1_1; + C1[0] += res1_0; + C1[1] += res1_1; C0 = C0+2; C1 = C1+2; @@ -444,9 +444,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; - C1[0] = res1_0; + C1[0] += res1_0; C0 = C0+1; C1 = C1+1; @@ -499,10 +499,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_2 *= alpha; res0_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; C0 = C0+4; @@ -532,8 +532,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; res0_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; C0 = C0+2; @@ -558,7 +558,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; C0 = C0+1; From 872a11a2bfd90225d5ace725b0ec4f59bd9291f3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:05:48 +0530 Subject: [PATCH 26/42] arm: add softfp support in sgemm/strmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 4 ++-- kernel/arm/KERNEL.ARMV7 | 3 +-- kernel/arm/sgemm_kernel_4x2_vfp.S | 12 ++++++++++++ kernel/arm/sgemm_kernel_4x4_vfpv3.S | 29 +++++++++++------------------ kernel/arm/strmm_kernel_4x2_vfp.S | 13 +++++++++++++ kernel/arm/strmm_kernel_4x4_vfpv3.S | 13 +++++++++++++ 6 files changed, 52 insertions(+), 22 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 022a93183..18d9869de 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -73,6 +73,7 @@ CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S SGEMMKERNEL = ../generic/gemmkernel_4x2.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S SGEMMITCOPY = sgemm_tcopy_4_vfp.S @@ -97,6 +98,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o STRMMKERNEL = ../generic/trmmkernel_4x2.c +STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = ../generic/trmmkernel_4x2.c CGEMMONCOPY = cgemm_ncopy_2_vfp.S @@ -111,12 +113,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -SGEMMKERNEL = sgemm_kernel_4x2_vfp.S DGEMMKERNEL = dgemm_kernel_4x2_vfp.S CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 0872cb8cd..e2044133d 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -11,7 +11,7 @@ DGEMVNKERNEL = 
gemv_n_vfpv3.S STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c -SGEMMKERNEL = ../generic/gemmkernel_4x4.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o @@ -30,7 +30,6 @@ DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index e8b44b742..1f21e5a1f 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 86198ac90..6491d3571 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 - -#ifdef ARM_SOFTFP_ABI -#define OLD_ALPHA r3 -//#define OLD_A -#else //hard #define OLD_A r3 #define OLD_ALPHA s0 -#endif /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] - -#ifndef ARM_SOFTFP_ABI #define A [fp, #-268 ] -#endif #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, #-240] @@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] -#ifdef ARM_SOFTFP_ABI -#define A [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] #define B [fp, #8 ] #define C [fp, #12 ] #define OLD_LDC [fp, #16 ] -#else //hard +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif - + #define I r0 #define J r1 #define L r2 @@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_M, M str OLD_N, N str OLD_K, K - -#ifdef ARM_SOFTFP_ABI - str OLD_ALPHA, ALPHA -#else //hard str OLD_A, A vstr OLD_ALPHA, ALPHA -#endif + sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 8f97644ec..635b1dd13 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
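// Layout assumed by the softfp defines below: the float alpha arrives in r3
// (copied to s0 with vmov in the prologue), so A, B, C, LDC and the TRMM
// OFFSET each sit one 4-byte slot later on the stack than in the hardfp
// variant.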
#define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define OLD_C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index 0dd03ac85..e24d24eba 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 09bc6ebe5b26aecd405a25dad2fa2934642fc827 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:24:38 +0530 Subject: [PATCH 27/42] arm: add softfp support in dgemm/dtrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 8 ++------ kernel/arm/KERNEL.ARMV7 | 10 +++------- kernel/arm/dgemm_kernel_4x2_vfp.S | 13 ++++++++++++- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 12 ++++++++++++ kernel/arm/dtrmm_kernel_4x2_vfp.S | 13 +++++++++++++ kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 13 +++++++++++++ 6 files changed, 55 insertions(+), 14 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 18d9869de..622085b45 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -72,7 +72,6 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -SGEMMKERNEL = ../generic/gemmkernel_4x2.c SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -85,7 +84,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_4x2.c +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = dgemm_ncopy_4_vfp.S DGEMMITCOPY = dgemm_tcopy_4_vfp.S @@ -97,9 +96,8 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRMMKERNEL = ../generic/trmmkernel_4x2.c STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = ../generic/trmmkernel_4x2.c +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S @@ -113,11 +111,9 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -DGEMMKERNEL = dgemm_kernel_4x2_vfp.S CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 
b/kernel/arm/KERNEL.ARMV7 index e2044133d..63c468e66 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -8,8 +8,8 @@ ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_4x4.c +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -17,7 +17,7 @@ SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_4x4.c +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMONCOPY = dgemm_ncopy_4_vfp.S DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o @@ -25,13 +25,9 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S - CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 183269d1b..001a6050c 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] - +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index b14052e06..1744b54d8 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index c578d2b1e..3d6fbf8e9 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
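// Worked offsets for the double-precision defines below, assuming the usual
// prologue of these kernels: push {r4 - r9, fp} stores seven words and
// "add fp, sp, #24" leaves fp one word below the caller's sp, so the first
// stack argument sits at [fp, #4].  The 8-byte alpha fills [fp, #4]..[fp, #11],
// which puts A at [fp, #12] with B, C, LDC and OFFSET following in 4-byte steps.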
#define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c7e455f16..c0c6a1677 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 305cd2e8b41f4daccdfa1e6631bce7f7133faf92 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:42:32 +0530 Subject: [PATCH 28/42] arm: add softfp support in cgemm/ctrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 4 ++-- kernel/arm/KERNEL.ARMV7 | 5 +++-- kernel/arm/cgemm_kernel_2x2_vfp.S | 14 ++++++++++++++ kernel/arm/cgemm_kernel_2x2_vfpv3.S | 14 ++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfp.S | 15 +++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 15 +++++++++++++++ 6 files changed, 63 insertions(+), 4 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 622085b45..e8fc3df73 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -98,7 +98,9 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o @@ -111,10 +113,8 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 63c468e66..4bfe18d1d 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -10,6 +10,7 @@ DGEMVNKERNEL = gemv_n_vfpv3.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -23,12 +24,12 @@ DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S + ifeq ($(ARM_ABI),hard) -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = 
zgemm_kernel_2x2_vfpv3.S endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index f0517cb47..512eea387 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index cf132a184..42eb53a55 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index 8cb7ede9d..95578b10a 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 97bd88c69..18beb4e47 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
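// Assumed split of the single-complex alpha in the defines below: alpha_r is
// the last argument that still fits in a core register (r3, fetched with
// vmov), while alpha_i spills to the first stack slot at [fp, #4] (fetched
// with vldr), leaving the pointer arguments to start at [fp, #8].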
#define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 97d671eb610de8cd73fa90923bfbed87d1d8ffef Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:54:32 +0530 Subject: [PATCH 29/42] arm: add softfp support in zgemm/ztrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 13 ++++--------- kernel/arm/KERNEL.ARMV7 | 15 +++++---------- kernel/arm/zgemm_kernel_2x2_vfp.S | 14 ++++++++++++++ kernel/arm/zgemm_kernel_2x2_vfpv3.S | 14 ++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfp.S | 15 +++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 15 +++++++++++++++ 6 files changed, 67 insertions(+), 19 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e8fc3df73..960dae67b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -96,25 +96,20 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S - CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S ZGEMMONCOPY = zgemm_ncopy_2_vfp.S ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ifeq ($(ARM_ABI),hard) - +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S - -endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4bfe18d1d..5e0b4cfb8 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -8,10 +8,6 @@ ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S - SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S @@ -25,11 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S - -ifeq ($(ARM_ABI),hard) - -ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S - ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -endif +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S + diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d2..618f09781 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792f..0fe0c1993 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b8..78d09a9c7 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc..bf72ce605 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+	vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+	vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+	ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
From 37efb5bc1d9b78e5e612b5aad896981d58a5d18f Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Sun, 2 Jul 2017 03:06:36 +0530
Subject: [PATCH 30/42] arm: Remove unnecessary files/code

Since softfp code has been added to all required vfp kernels, the code for
auto-detection of the ABI is no longer required. The option to force the
softfp ABI on the make command line by giving ARM_SOFTFP_ABI=1 is retained,
but there is no need to give this option anymore.

Also, the newly added C versions of the 4x4/4x2 gemm/trmm kernels are
removed. These are no longer required; moreover, these kernels have bugs.
---
Makefile.system | 19 +-
c_check | 16 +-
kernel/generic/gemmkernel_4x2.c | 317 ------------------
kernel/generic/gemmkernel_4x4.c | 571 --------------------------------
kernel/generic/trmmkernel_4x2.c | 528 -----------------------------
5 files changed, 8 insertions(+), 1443 deletions(-)
delete mode 100644 kernel/generic/gemmkernel_4x2.c
delete mode 100644 kernel/generic/gemmkernel_4x4.c
delete mode 100644 kernel/generic/trmmkernel_4x2.c
diff --git a/Makefile.system b/Makefile.system
index 2cae5f1c9..4face0e51 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -487,19 +487,14 @@ ifeq ($(ARCH), arm)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
-# If ABI is specified on command line use it. Else use the automatically detected ABI.
-ifeq ($(ARM_SOFTFP_ABI),1)
-ARM_ABI = softfp
-else
-ifeq ($(ARM_HARD_ABI),1)
-ARM_ABI = hard
-else
-ARM_ABI=$(ARM_ABI_AUTO)
+CCOMMON_OPT += -marm
+FCOMMON_OPT += -marm
+
+# If softfp abi is mentioned on the command line, force it.
+ifeq ($(ARM_SOFTFP_ABI), 1)
+CCOMMON_OPT += -mfloat-abi=softfp
+FCOMMON_OPT += -mfloat-abi=softfp
endif
-endif
-export ARM_ABI_AUTO
-CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI)
-FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI)
endif
diff --git a/c_check b/c_check
index 2e7e08cfb..20da288be 100644
--- a/c_check
+++ b/c_check
@@ -94,17 +94,7 @@ if ($architecture eq "mips64") {
$defined = 1;
}
-if ($architecture eq "arm") {
-	$defined = 1;
-	$data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`;
-	if ($data ne "") {
-		$abi = "hard";
-	} else {
-		$abi = "softfp";
-	}
-}
-
-if ($architecture eq "arm64") {
+if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
@@ -297,10 +287,6 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
-if ($architecture eq "arm") {
-	print MAKEFILE "ARM_ABI_AUTO=$abi\n";
-}
-
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
$compiler =~ tr/[a-z]/[A-Z]/;
diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c
deleted file mode 100644
index 8c784e2f1..000000000
--- a/kernel/generic/gemmkernel_4x2.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/***************************************************************************
-Copyright (c) 2017, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2.
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - for (j=0; j<(bn/2); j+=2) - { - C0 = C; - C1 = C0+ldc; - - ptrba = ba; - - for (i=0; i - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT res2_0; - FLOAT res2_1; - FLOAT res2_2; - FLOAT res2_3; - - FLOAT res3_0; - FLOAT res3_1; - FLOAT res3_2; - FLOAT res3_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - FLOAT b2; - FLOAT b3; - - - for (j=0; j - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - for (j=0; j<(bn/2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - - ptrba = ba; - - for (i=0; i Date: Sun, 2 Jul 2017 00:50:14 +0200 Subject: [PATCH 31/42] Add files via upload --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 499732d16..107dff5f2 100644 --- a/Makefile +++ b/Makefile @@ -25,8 +25,8 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench -.PHONY : all libs netlib test ctest shared install -.NOTPARALLEL : all libs 
prof lapack-test install blas-test
+.PHONY : all libs netlib $(RELA) test ctest shared install
+.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test

 all :: libs netlib $(RELA) tests shared
 	@echo
@@ -339,6 +339,7 @@ endif
 	@touch $(NETLIB_LAPACK_DIR)/make.inc
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
 	@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
+	@$(MAKE) -C relapack clean
 	@rm -f *.grd Makefile.conf_last config_last.h
 	@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
 	@echo Done.

From df2dfe65d6ba112e70ec6c1dce167298d2d5779b Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 2 Jul 2017 01:46:23 +0200
Subject: [PATCH 32/42] Update relapack/Makefile (build the ReLAPACK library
 with verbose ar)

---
 relapack/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/relapack/Makefile b/relapack/Makefile
index 1e81b5423..90d37f101 100644
--- a/relapack/Makefile
+++ b/relapack/Makefile
@@ -29,7 +29,7 @@ LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm

 libs: $(OBJS)
 	@echo "Building ReLAPACK library $(LIBNAME)"
-	$(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS)
+	$(AR) -rv $(TOPDIR)/$(LIBNAME) $(OBJS)
 	$(RANLIB) $(TOPDIR)/$(LIBNAME)

 %.o: %.c config.h

From fa6a920caa09e60e24acfad7ad8acdf30e8a7f14 Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Wed, 5 Jul 2017 17:01:03 +0800
Subject: [PATCH 33/42] Link -lm or -lm_hard for Android ARMv7.

---
 Makefile.system | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 29d3efd53..bb55dd693 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -493,6 +493,14 @@ else
 CCOMMON_OPT += -mfloat-abi=hard
 FCOMMON_OPT += -mfloat-abi=hard
 endif
+
+ifeq ($(OSNAME), Android)
+ifeq ($(ARM_SOFTFP_ABI), 1)
+EXTRALIB += -lm
+else
+EXTRALIB += -Wl,-lm_hard
+endif
+endif
 endif

 ifeq ($(ARCH), arm64)

From 3381f2370966d61c333476136fa188567e958407 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 6 Jul 2017 10:12:00 +0200
Subject: [PATCH 34/42] Handle different object extensions in Makefile

The optimized LAPACK functions from the interface directory use the
OS-dependent suffixes .o/.obj for their object files, while netlib
LAPACK uses .o throughout. ReLAPACK object names have to match for
the function replacement in the growing library file to work.
--- relapack/Makefile | 56 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/relapack/Makefile b/relapack/Makefile index 90d37f101..ddf101bd1 100644 --- a/relapack/Makefile +++ b/relapack/Makefile @@ -4,7 +4,38 @@ include $(TOPDIR)/Makefile.system SRC = $(wildcard src/*.c) -OBJS = $(SRC:%.c=%.o) + +SRC1 = \ + src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ + src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ + src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ + src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c + +SRC2 = \ + src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ + src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ + src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ + src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ + src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ + src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ + src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ + src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ + src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c + +SRCX = \ + src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ + src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ + src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ + src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ + src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ + src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ + src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ + src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ + src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c + +OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) +OBJS2 = $(SRC2:%.c=%.o) +OBJS = $(OBJS1) $(OBJS2) TEST_SUITS = \ slauum dlauum clauum zlauum \ @@ -29,9 +60,12 @@ LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm libs: $(OBJS) @echo "Building ReLAPACK library $(LIBNAME)" - $(AR) -rv $(TOPDIR)/$(LIBNAME) $(OBJS) + $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) $(RANLIB) $(TOPDIR)/$(LIBNAME) +%.$(SUFFIX): %.c config.h + $(CC) $(CFLAGS) -c $< -o $@ + %.o: %.c config.h $(CC) $(CFLAGS) -c $< -o $@ @@ -45,20 +79,20 @@ test/%.pass: test/%.x @echo -n $*: @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) -test/s%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h - $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) +test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) 
$(LINK_TEST)

-test/d%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
-	$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+	$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

-test/c%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
-	$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+	$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

-test/z%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
-	$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+	$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

 # cleaning up
 clean:
-	rm -f $(OBJS) test/util.o test/*.x
+	rm -f $(OBJS) test/util.$(SUFFIX) test/*.x

From 49e62c0e7796ccaa773591e739628846e3d8ab06 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 6 Jul 2017 17:30:12 +0200
Subject: [PATCH 35/42] Fix stride calculation in syrk_thread.c (taken from
 wernsaar)

Stride calculation fix copied from
https://github.com/wernsaar/OpenBLAS/commit/88900e1
(A short rounding sketch follows the PATCH 36 message below.)

---
 driver/level3/syrk_thread.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c
index 94274be72..5f40853dc 100644
--- a/driver/level3/syrk_thread.c
+++ b/driver/level3/syrk_thread.c
@@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
     if (nthreads - num_cpu > 1) {
       di = (double)i;
-      width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
+      width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);

       if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
     if (nthreads - num_cpu > 1) {
       di = (double)(arg -> n - i);
-      width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
+      width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);

       if ((width <= 0) || (width > n_to - i)) width = n_to - i;

From f02d535fdebd541b0fc21a58c8b370c744555531 Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Fri, 7 Jul 2017 12:30:42 +0530
Subject: [PATCH 36/42] arm: Fix clang compilation for ARMv7

clang does not recognize some pre-UAL VFP mnemonics such as fnmacs,
fnmacd, fnmuls and fnmuld. Replace them with the equivalent UAL
mnemonics, which are vmls.f32, vmls.f64, vnmul.f32 and vnmul.f64
respectively.
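[Editor's note on the stride fix in PATCH 35 above: the old expression rounded the computed width up to a multiple of mask+1 with a bitmask, which is only valid when mask+1 is a power of two, and it also truncated the square root to an integer before adding mask. The divide-and-multiply form works for any alignment. The following standalone C sketch is illustrative only; round_mask and round_div are hypothetical helpers, not OpenBLAS code.]

#include <stdio.h>

typedef long BLASLONG;   /* stand-in for the OpenBLAS typedef */

/* Old scheme: round w up to a multiple of (mask+1) with a bitmask.
 * Only correct when mask+1 is a power of two. */
static BLASLONG round_mask(double w, BLASLONG mask) {
  return ((BLASLONG)w + mask) & ~mask;
}

/* New scheme: divide and multiply, valid for any mask. */
static BLASLONG round_div(double w, BLASLONG mask) {
  return (BLASLONG)((w + mask) / (mask + 1)) * (mask + 1);
}

int main(void) {
  /* mask = 3 (power-of-two alignment 4): both schemes agree and print 8. */
  printf("w=5, mask=3: old=%ld new=%ld\n", round_mask(5.0, 3), round_div(5.0, 3));
  /* mask = 2 (alignment 3): the bitmask version yields 4, which is not a
   * multiple of 3; the divide form yields the intended 6. */
  printf("w=4, mask=2: old=%ld new=%ld\n", round_mask(4.0, 2), round_div(4.0, 2));
  return 0;
}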
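[Editor's note on PATCH 36: to make the mnemonic substitution concrete, here is a minimal inline-assembly sketch. It assumes a 32-bit ARM target with VFP and GCC/clang's "t" (single-precision VFP register) constraint, and it illustrates the instruction equivalence rather than reproducing code from the patch: the pre-UAL fnmacs Sd, Sn, Sm and the UAL vmls.f32 Sd, Sn, Sm both compute Sd = Sd - Sn*Sm, while fnmuls and vnmul.f32 compute -(Sn*Sm).]

#include <stdio.h>

/* UAL spelling of fnmacs: acc = acc - a*b */
static inline float mls_f32(float acc, float a, float b) {
  __asm__("vmls.f32 %0, %1, %2" : "+t"(acc) : "t"(a), "t"(b));
  return acc;
}

/* UAL spelling of fnmuls: result = -(a*b) */
static inline float nmul_f32(float a, float b) {
  float r;
  __asm__("vnmul.f32 %0, %1, %2" : "=t"(r) : "t"(a), "t"(b));
  return r;
}

int main(void) {
  /* Expected output: 4.000000 -6.000000 (10 - 2*3 and -(2*3)).
   * Build with an ARMv7 VFP toolchain, e.g. (assumed triplet)
   * arm-linux-gnueabihf-gcc -mfpu=vfpv3 vfp_ual.c */
  printf("%f %f\n", mls_f32(10.0f, 2.0f, 3.0f), nmul_f32(2.0f, 3.0f));
  return 0;
}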
--- kernel/arm/axpy_vfp.S | 8 ++-- kernel/arm/cgemm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/cgemv_n_vfp.S | 16 +++---- kernel/arm/cgemv_t_vfp.S | 16 +++---- kernel/arm/ctrmm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/rot_vfp.S | 72 ++++++++++++++--------------- kernel/arm/scal_vfp.S | 24 +++++----- kernel/arm/zgemm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/zgemv_n_vfp.S | 16 +++---- kernel/arm/zgemv_t_vfp.S | 16 +++---- kernel/arm/ztrmm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 16 +++---- 15 files changed, 148 insertions(+), 148 deletions(-) diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 8e5334f62..4040c7da2 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -71,14 +71,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define FMAC_R1 fmacd -#define FMAC_R2 fnmacd +#define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs -#define FMAC_R2 fnmacs +#define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs @@ -90,14 +90,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacd #define FMAC_R2 fmacd -#define FMAC_I1 fnmacd +#define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs #define FMAC_R2 fmacs -#define FMAC_I1 fnmacs +#define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index f0517cb47..639b713cd 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -94,42 +94,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index cf132a184..16c00ad73 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -106,10 +106,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmacs - #define FMAC_R2 fnmacs + #define FMAC_R1 vmls.f32 + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs - #define FMAC_I2 fnmacs + #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs @@ -136,10 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmacs + #define FMAC_R1 vmls.f32 #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs - #define FMAC_I2 fnmacs + #define FMAC_I1 vmls.f32 + #define FMAC_I2 vmls.f32 #endif diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 5d2748644..a9040e76e 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -78,42 +78,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 76c8a8f18..56451c5df 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -76,42 +76,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index 8cb7ede9d..50798449b 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -98,42 +98,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 97bd88c69..ef7e58fa4 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -93,10 +93,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmuls - #define FMAC_R2 fnmacs + #define FMAC_R1 vnmul.f32 + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls - #define FMAC_I2 fnmacs + #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuls #define FMAC_R2 fmacs - #define FMAC_I1 fnmuls + #define FMAC_I1 vnmul.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubs #define FMAC_R1 fmuls - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls #define FMAC_I2 fmacs @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmuls + #define FMAC_R1 vnmul.f32 #define FMAC_R2 fmacs - #define FMAC_I1 fnmuls - #define FMAC_I2 fnmacs + #define FMAC_I1 vnmul.f32 + #define FMAC_I2 vmls.f32 #endif diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index d053423b6..0d1067cf9 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -100,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X, { d2 } fstmiad Y, { d3 } @@ -145,7 +145,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -154,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -163,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -186,7 +186,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X, { s2 } fstmias Y, { s3 } @@ -226,13 +226,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -241,13 +241,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -259,13 +259,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -274,13 +274,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -294,13 +294,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -314,13 +314,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 vstr d2 , [ X, #0 ] vstr d3 , [ Y, #0 ] vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 vstr d2 , [ X, #8 ] vstr d3 , [ Y, #8 ] @@ -343,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -358,13 +358,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -376,13 +376,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -391,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -411,13 +411,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -431,13 +431,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 vstr s2 , [ X, #0 ] vstr s3 , [ Y, #0 ] vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 vstr s2 , [ X, #4 ] vstr s3 , [ Y, #4 ] diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S index a8939c3a2..cc3e3b98d 100644 --- a/kernel/arm/scal_vfp.S +++ b/kernel/arm/scal_vfp.S @@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X, { d2 - d3 } @@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X, { s2 - s3 } diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d2..6aeb6c790 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -87,42 +87,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792f..10c83e356 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -106,10 +106,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd - #define FMAC_R2 fnmacd + #define FMAC_R1 vmls.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd @@ -136,10 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd + #define FMAC_R1 vmls.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd - #define FMAC_I2 fnmacd + #define FMAC_I1 vmls.f64 + #define FMAC_I2 vmls.f64 #endif diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043..cba59567d 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -79,42 +79,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa0701..b11c08086 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -77,42 +77,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b8..f412dfcfa 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -96,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc..92370bbc1 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -93,10 +93,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld - #define FMAC_R2 fnmacd + #define FMAC_R1 vnmul.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuld #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld + #define FMAC_I1 vnmul.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmuld - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld #define FMAC_I2 fmacd @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld + #define FMAC_R1 vnmul.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld - #define FMAC_I2 fnmacd + #define FMAC_I1 vnmul.f64 + #define FMAC_I2 vmls.f64 #endif From c1cf62d2c030b65d8fbf37b19ca1d88fa38f7709 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jul 2017 09:45:38 +0200 Subject: [PATCH 37/42] Add sched_getcpu implementation for pre-2.6 glibc Fixes #1210, compilation on RHEL5 with affinity enabled --- driver/others/init.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 9be6f52b0..3e6176967 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -354,6 +354,24 @@ static int numa_check(void) { return common -> num_nodes; } +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 6) +int sched_getcpu(void) +{ +int cpu; +FILE *fp = NULL; +if ( (fp = fopen("/proc/self/stat", "r")) == NULL) + return -1; +if ( fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu) != 1) { + fclose (fp); + return -1; + } + fclose (fp); + return(cpu); +} +#endif +#endif + static void numa_mapping(void) { int node, cpu, core; @@ -808,7 +826,6 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { - #ifdef DEBUG fprintf(stderr, "Shared Memory Initialization.\n"); #endif @@ -830,7 +847,7 @@ void gotoblas_affinity_init(void) { if (common -> num_nodes > 1) numa_mapping(); common -> final_num_procs = 0; - for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. + for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. 
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;

From ad2462811a4093153ea9898200ab73ef4aea6f23 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 9 Jul 2017 13:15:24 +0200
Subject: [PATCH 38/42] Do not add -lpthread on Android builds (#1229)

* Do not add -lpthread on Android builds
* Do not add -lpthread on Android cmake builds

---
 cmake/os.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/os.cmake b/cmake/os.cmake
index f5a75027c..e9df68d7f 100644
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@@ -77,7 +77,7 @@ if (CYGWIN)
   set(NO_EXPRECISION 1)
 endif ()

-if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
+if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
   if (SMP)
     set(EXTRALIB "${EXTRALIB} -lpthread")
   endif ()

From 4a012c3d208f7e2a1df9303a50c884970217a259 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 11 Jul 2017 15:39:15 +0200
Subject: [PATCH 39/42] Fix unintentional fall-through cases in get_cacheinfo

These appear to be unintended side effects of PR #1091, probably
causing #1232

---
 cpuid_x86.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index ab2ecdcaf..3733ea3ac 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -637,12 +637,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
     LD1.linesize = 64;
     break;
   case 0x63 :
-    DTB.size        = 2048;
-    DTB.associative = 4;
-    DTB.linesize    = 32;
-    LDTB.size       = 4096;
-    LDTB.associative= 4;
-    LDTB.linesize   = 32;
+    DTB.size        = 2048;
+    DTB.associative = 4;
+    DTB.linesize    = 32;
+    LDTB.size       = 4096;
+    LDTB.associative= 4;
+    LDTB.linesize   = 32;
+    break;
   case 0x66 :
     LD1.size        = 8;
     LD1.associative = 4;
@@ -675,12 +676,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
     LC1.associative = 8;
     break;
   case 0x76 :
-    ITB.size        = 2048;
-    ITB.associative = 0;
-    ITB.linesize    = 8;
-    LITB.size       = 4096;
-    LITB.associative= 0;
-    LITB.linesize   = 8;
+    ITB.size        = 2048;
+    ITB.associative = 0;
+    ITB.linesize    = 8;
+    LITB.size       = 4096;
+    LITB.associative= 0;
+    LITB.linesize   = 8;
+    break;
   case 0x77 :
     LC1.size        = 16;
     LC1.associative = 4;

From 529bfc36ec444223ba7b49717ef2d7fa12445159 Mon Sep 17 00:00:00 2001
From: Andrew
Date: Wed, 12 Jul 2017 00:59:30 +0200
Subject: [PATCH 40/42] Fix write past fixed size buffer

---
 driver/level2/gbmv_thread.c | 2 +-
 driver/level2/sbmv_thread.c | 2 +-
 driver/level2/spmv_thread.c | 2 +-
 driver/level2/tbmv_thread.c | 2 +-
 driver/level2/tpmv_thread.c | 2 +-
 driver/level2/trmv_thread.c | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c
index ef9d58d76..e86b565f8 100644
--- a/driver/level2/gbmv_thread.c
+++ b/driver/level2/gbmv_thread.c
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
-  BLASLONG range_m[MAX_CPU_NUMBER];
+  BLASLONG range_m[MAX_CPU_NUMBER + 1];
   BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c
index a0377d638..5718c0ec9 100644
--- a/driver/level2/sbmv_thread.c
+++ b/driver/level2/sbmv_thread.c
@@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
 #endif

   blas_arg_t args;
-  blas_queue_t queue[MAX_CPU_NUMBER];
+  blas_queue_t queue[MAX_CPU_NUMBER + 1];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
   BLASLONG range_n[MAX_CPU_NUMBER];

diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c
index f8ae3cdcd..035300841 100644
--- a/driver/level2/spmv_thread.c
+++ b/driver/level2/spmv_thread.c
@@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c
index bbb1c50eb..226a922e9 100644
--- a/driver/level2/tbmv_thread.c
+++ b/driver/level2/tbmv_thread.c
@@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c
index 47dc1daf9..c91b52775 100644
--- a/driver/level2/tpmv_thread.c
+++ b/driver/level2/tpmv_thread.c
@@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
index 42edb83cb..0a155366c 100644
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

From d33fc32cf30cf1262030c93fa44c72ca8ab27681 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 12 Jul 2017 09:35:11 +0200
Subject: [PATCH 41/42] Revert "Fix unintentional fall-through cases in
 get_cacheinfo"

---
 cpuid_x86.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 3733ea3ac..ab2ecdcaf 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -637,13 +637,12 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
     LD1.linesize = 64;
     break;
   case 0x63 :
-    DTB.size        = 2048;
-    DTB.associative = 4;
-    DTB.linesize    = 32;
-    LDTB.size       = 4096;
-    LDTB.associative= 4;
-    LDTB.linesize   = 32;
-    break;
+    DTB.size        = 2048;
+    DTB.associative = 4;
+    DTB.linesize    = 32;
+    LDTB.size       = 4096;
+    LDTB.associative= 4;
+    LDTB.linesize   = 32;
   case 0x66 :
     LD1.size        = 8;
     LD1.associative = 4;
@@ -676,13 +675,12 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
     LC1.associative = 8;
     break;
   case 0x76 :
-    ITB.size        = 2048;
-    ITB.associative = 0;
-    ITB.linesize    = 8;
-    LITB.size       = 4096;
-    LITB.associative= 0;
-    LITB.linesize   = 8;
-    break;
+    ITB.size        = 2048;
+    ITB.associative = 0;
+    ITB.linesize    = 8;
+    LITB.size       = 4096;
+    LITB.associative= 0;
+    LITB.linesize   = 8;
   case 0x77 :
     LC1.size        = 16;
     LC1.associative = 4;

From 31e086d6a66658e7b04390e69884fe569e0a1e9d Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 13 Jul 2017 22:01:47 +0200
Subject: [PATCH 42/42] Disable ReLAPACK by default (#1238)

* Disable ReLAPACK by default; mention it in the final build message if included
* Add files via upload
* Add files via upload
* Add files via upload
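[Editor's note on the buffer fix in PATCH 40 above: partitioning n rows among p threads stores both endpoints of every range, so p+1 boundary values are needed; a p-element array is overrun on the last write. A self-contained C sketch of the fencepost problem, hypothetical code rather than OpenBLAS source:]

#include <stdio.h>

#define MAX_CPU_NUMBER 4

int main(void) {
  long range[MAX_CPU_NUMBER + 1]; /* p partitions need p+1 fenceposts */
  long n = 1000;
  int p = MAX_CPU_NUMBER;

  range[0] = 0;
  for (int i = 0; i < p; i++)
    range[i + 1] = range[i] + n / p; /* last pass writes index p, which
                                        overflows a p-element array */

  for (int i = 0; i < p; i++)
    printf("thread %d: rows [%ld, %ld)\n", i, range[i], range[i + 1]);
  return 0;
}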
---
 Makefile        | 2 +-
 Makefile.rule   | 2 +-
 Makefile.system | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 107dff5f2..1b9bcb118 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@ SUBDIRS += lapack
 endif

 RELA =
-ifneq ($(BUILD_RELAPACK), 0)
+ifeq ($(BUILD_RELAPACK), 1)
 RELA = re_lapack
 endif

diff --git a/Makefile.rule b/Makefile.rule
index 2866699be..8d8aecdc9 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -84,7 +84,7 @@ VERSION = 0.2.20.dev
 BUILD_LAPACK_DEPRECATED = 1

 # Build RecursiveLAPACK on top of LAPACK
-BUILD_RELAPACK = 1
+# BUILD_RELAPACK = 1

 # If you want to use legacy threaded Level 3 implementation.
 # USE_SIMPLE_THREADED_LEVEL3 = 1

diff --git a/Makefile.system b/Makefile.system
index c4cf619d0..bd361a1a2 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1129,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
 ifneq ($(NO_LAPACKE), 1)
 LIB_COMPONENTS += LAPACKE
 endif
+ifeq ($(BUILD_RELAPACK), 1)
+LIB_COMPONENTS += ReLAPACK
+endif
 endif

 ifeq ($(ONLY_CBLAS), 1)
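[Editor's note: with this change ReLAPACK becomes opt-in. If this reading of the Makefiles is right, passing `BUILD_RELAPACK=1` on the make command line, or uncommenting the BUILD_RELAPACK line in Makefile.rule, should re-enable it, and ReLAPACK will then appear in the LIB_COMPONENTS summary printed at the end of the build.]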