From 96dd0ef4f71da18324864e979133e873aa66306a Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 14:25:10 +1000 Subject: [PATCH 01/42] Optimise ccopy for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/ccopy_microk_power8.c | 128 ++++++++++++++--------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c index b2b1bead1..613c4d286 100644 --- a/kernel/power/ccopy_microk_power8.c +++ b/kernel/power/ccopy_microk_power8.c @@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %2, %2, 128 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 
\n\t" "addi %2, %2, 128 \n\t" @@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : From be55f96cbdc919c7c7da2da2f7a2c6c47336a9f6 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 14:58:00 +1000 Subject: [PATCH 02/42] Optimise scopy for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/scopy_microk_power8.c | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 444a6d4d5..7a54d5e1e 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" @@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, 
%9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : From 6f4eca5ea4ab00726199277bb7a079900d20d388 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:23:20 +1000 Subject: [PATCH 03/42] Optimise sswap for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/sswap_microk_power8.c | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index d44f16765..cfefdd6ef 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" - "lxvw4x 40, 0, %3 \n\t" - "lxvw4x 41, %5, %3 \n\t" - "lxvw4x 42, %6, %3 \n\t" - "lxvw4x 43, %7, %3 \n\t" - "lxvw4x 44, %8, %3 \n\t" - "lxvw4x 45, %9, %3 \n\t" - "lxvw4x 46, %10, %3 \n\t" - "lxvw4x 47, %11, %3 \n\t" + "lxvd2x 40, 0, %3 \n\t" + "lxvd2x 41, %5, %3 \n\t" + "lxvd2x 42, %6, %3 \n\t" + "lxvd2x 43, %7, %3 \n\t" + "lxvd2x 44, %8, %3 \n\t" + "lxvd2x 45, %9, %3 \n\t" + "lxvd2x 46, %10, %3 \n\t" + "lxvd2x 47, %11, %3 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %4 \n\t" - "stxvw4x 41, %5, %4 \n\t" - "stxvw4x 42, %6, %4 \n\t" - "stxvw4x 43, %7, %4 \n\t" - "stxvw4x 44, %8, %4 \n\t" - "stxvw4x 45, %9, %4 \n\t" - "stxvw4x 46, %10, %4 \n\t" - "stxvw4x 47, %11, %4 \n\t" + "stxvd2x 40, 0, %4 \n\t" + "stxvd2x 41, %5, %4 \n\t" + "stxvd2x 42, %6, %4 \n\t" + "stxvd2x 43, %7, %4 \n\t" + "stxvd2x 44, %8, %4 \n\t" + "stxvd2x 45, %9, %4 \n\t" + "stxvd2x 46, %10, %4 \n\t" + "stxvd2x 47, %11, %4 \n\t" "addi %4, %4, 128 \n\t" From 4f09030fdc36444709cf3af9041a8043f1f6d83d Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:36:10 +1000 Subject: [PATCH 04/42] Optimise cswap for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
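[The same one-line rationale recurs in each of these patches, so it is worth spelling out why the substitution is safe: on little-endian POWER, lxvd2x may place the loaded elements in a different lane order within the vector register than lxvw4x would, but every kernel touched in this series either stores the data back with the matching stxvd2x — which applies the inverse permutation, preserving memory order — or performs only lane-order-independent elementwise work. A minimal sketch of the round-trip argument, assuming a powerpc64le target with VSX; copy4_vsx is an illustrative name, not part of the patches:]

    /* Sketch only: pairing lxvd2x with stxvd2x round-trips 4 floats
     * through vs32; whatever lane permutation the load introduces is
     * undone by the matching store, so memory order is preserved. */
    static void copy4_vsx (const float *x, float *y)
    {
      __asm__ ("lxvd2x  32, 0, %0          \n\t"   // load 16 bytes into vs32
               "stxvd2x 32, 0, %1          \n"     // store them back unchanged
               :
               : "r" (x), "r" (y)
               : "vs32", "memory");
    }
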
--- kernel/power/cswap_microk_power8.c | 128 ++++++++++++++--------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c index 1dd03dc88..8d7d0c0b9 100644 --- a/kernel/power/cswap_microk_power8.c +++ b/kernel/power/cswap_microk_power8.c @@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "lxvw4x 40, 0, %4 \n\t" - "lxvw4x 41, %5, %4 \n\t" - "lxvw4x 42, %6, %4 \n\t" - "lxvw4x 43, %7, %4 \n\t" - "lxvw4x 44, %8, %4 \n\t" - "lxvw4x 45, %9, %4 \n\t" - "lxvw4x 46, %10, %4 \n\t" - "lxvw4x 47, %11, %4 \n\t" + "lxvd2x 40, 0, %4 \n\t" + "lxvd2x 41, %5, %4 \n\t" + "lxvd2x 42, %6, %4 \n\t" + "lxvd2x 43, %7, %4 \n\t" + "lxvd2x 44, %8, %4 \n\t" + "lxvd2x 45, %9, %4 \n\t" + "lxvd2x 46, %10, %4 \n\t" + "lxvd2x 47, %11, %4 \n\t" "addi %4, %4, -128 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 49, %5, %3 \n\t" - "lxvw4x 50, %6, %3 \n\t" - "lxvw4x 51, %7, %3 \n\t" - "lxvw4x 0, %8, %3 \n\t" - "lxvw4x 1, %9, %3 \n\t" - "lxvw4x 2, %10, %3 \n\t" - "lxvw4x 3, %11, %3 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "lxvd2x 0, %8, %3 \n\t" + "lxvd2x 1, %9, %3 \n\t" + "lxvd2x 2, %10, %3 \n\t" + "lxvd2x 3, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "lxvw4x 4, 0, %3 \n\t" - "lxvw4x 5, %5, %3 \n\t" - "lxvw4x 6, %6, %3 \n\t" - "lxvw4x 7, %7, %3 \n\t" - "lxvw4x 8, %8, %3 \n\t" - "lxvw4x 9, %9, %3 \n\t" - "lxvw4x 10, %10, %3 \n\t" - "lxvw4x 11, %11, %3 \n\t" + "lxvd2x 4, 0, %3 \n\t" + "lxvd2x 5, %5, %3 \n\t" + "lxvd2x 6, %6, %3 \n\t" + "lxvd2x 7, %7, %3 \n\t" + "lxvd2x 8, %8, %3 \n\t" + "lxvd2x 9, %9, %3 \n\t" + "lxvd2x 10, %10, %3 \n\t" + "lxvd2x 11, %11, %3 \n\t" "addi %3, %3, -128 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 48, 0, %4 \n\t" - "stxvw4x 49, %5, %4 \n\t" - "stxvw4x 50, %6, %4 \n\t" - "stxvw4x 51, %7, %4 \n\t" - "stxvw4x 0, %8, %4 \n\t" - "stxvw4x 1, %9, %4 \n\t" - "stxvw4x 2, %10, %4 \n\t" - "stxvw4x 3, %11, %4 \n\t" + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 0, %8, %4 \n\t" + 
"stxvd2x 1, %9, %4 \n\t" + "stxvd2x 2, %10, %4 \n\t" + "stxvd2x 3, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "stxvw4x 4, 0, %4 \n\t" - "stxvw4x 5, %5, %4 \n\t" - "stxvw4x 6, %6, %4 \n\t" - "stxvw4x 7, %7, %4 \n\t" - "stxvw4x 8, %8, %4 \n\t" - "stxvw4x 9, %9, %4 \n\t" - "stxvw4x 10, %10, %4 \n\t" - "stxvw4x 11, %11, %4 \n\t" + "stxvd2x 4, 0, %4 \n\t" + "stxvd2x 5, %5, %4 \n\t" + "stxvd2x 6, %6, %4 \n\t" + "stxvd2x 7, %7, %4 \n\t" + "stxvd2x 8, %8, %4 \n\t" + "stxvd2x 9, %9, %4 \n\t" + "stxvd2x 10, %10, %4 \n\t" + "stxvd2x 11, %11, %4 \n\t" "addi %4, %4, 128 \n\t" From 19bdf9d52b222a4edd3e1710023af8c40f84c255 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:38:32 +1000 Subject: [PATCH 05/42] Optimise casum for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/casum_microk_power8.c | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c index 93ba50660..7d12c9885 100644 --- a/kernel/power/casum_microk_power8.c +++ b/kernel/power/casum_microk_power8.c @@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" From 32c7fe6bff6f04d61e6a09d10199a14e63e77083 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:39:27 +1000 Subject: [PATCH 06/42] Optimise sasum for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
--- kernel/power/sasum_microk_power8.c | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index 08a766f80..4bb515de8 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" From e0034de22d9a789988e29e3b67a796cee0c97965 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:43:31 +1000 Subject: [PATCH 07/42] Optimise sdot for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
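[For sdot the lane-order argument needs one extra observation: x and y are loaded with identical lxvd2x permutations, so each xvmaddasp product still pairs x[i] with y[i] regardless of lane order. Scalar reference as a sketch; sdot_ref is an illustrative name:]

    /* Scalar reference (sketch): both inputs see the same permutation,
     * so every product pairs matching elements; the final sum is again
     * order-independent up to rounding. */
    static float sdot_ref (long n, const float *x, const float *y)
    {
        float dot = 0.0f;
        for (long i = 0; i < n; i++)
            dot += x[i] * y[i];
        return dot;
    }
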
--- kernel/power/sdot_microk_power8.c | 64 +++++++++++++++---------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index 7f7ccfac3..bfe100c8b 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" @@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y) "1: \n\t" "xvmaddasp 32, 40, 48 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" "xvmaddasp 33, 41, 49 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" "xvmaddasp 34, 42, 50 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" "xvmaddasp 35, 43, 51 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" "xvmaddasp 36, 44, %x4 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" "xvmaddasp 37, 45, %x5 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" "xvmaddasp 38, 46, %x6 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" "xvmaddasp 39, 47, %x7 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" From edc97918f8e45e6e922d0e221cf103a4c736ca61 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 14 Jun 2017 16:45:58 +1000 Subject: [PATCH 08/42] Optimise srot for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. 
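[The asm comments in the hunks below spell out the arithmetic; as a scalar reference (a sketch, srot_ref is an illustrative name), the kernel applies a plane rotation, and because c and s are splatted identically to all lanes, the operation is once more lane-order independent:]

    /* Scalar reference (sketch) for the plane rotation applied below:
     * x'[i] = c*x[i] + s*y[i],  y'[i] = c*y[i] - s*x[i]. */
    static void srot_ref (long n, float *x, float *y, float c, float s)
    {
        for (long i = 0; i < n; i++) {
            float xi = c * x[i] + s * y[i];
            float yi = c * y[i] - s * x[i];
            x[i] = xi;
            y[i] = yi;
        }
    }
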
--- kernel/power/srot_microk_power8.c | 64 +++++++++++++++---------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 0a18c16e0..6eecb60a1 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xscvdpspn 37, %x14 \n\t" // load s to all words "xxspltw 37, 37, 0 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "addi %3, %3, 64 \n\t" "addi %4, %4, 64 \n\t" @@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvmulsp 44, 32, 37 \n\t" // s * x "xvmulsp 45, 33, 37 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" "xvmulsp 46, 34, 37 \n\t" "xvmulsp 47, 35, 37 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" "xvmulsp %x9, 48, 37 \n\t" // s * y "xvmulsp %x10, 49, 37 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" "xvmulsp %x11, 50, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y @@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n\t" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n\t" "addi %3, %3, 128 \n\t" "addi %4, %4, 128 \n\t" @@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n" "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" From bd831a03a80d642693c786f7a65265ad40a50fc0 Mon Sep 17 00:00:00 2001 From: Matt 
Brown Date: Wed, 14 Jun 2017 16:47:56 +1000 Subject: [PATCH 09/42] Optimise sscal for POWER9 Use lxvd2x instruction instead of lxvw4x. lxvd2x performs far better on the new POWER architecture than lxvw4x. --- kernel/power/sscal_microk_power8.c | 80 +++++++++++++++--------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 49862a329..058ff3399 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xscvdpspn %x3, %x3 \n\t" "xxspltw %x3, %x3, 0 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" "xvmulsp 42, 34, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" "xvmulsp 44, 36, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, -128 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n\t" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n\t" "addi %2, %2, 256 \n\t" @@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : @@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x) ".p2align 5 \n" "1: \n\t" - "stxvw4x %x3, 0, %2 \n\t" - "stxvw4x %x3, %4, %2 \n\t" - "stxvw4x %x3, %5, %2 \n\t" - "stxvw4x %x3, %6, %2 \n\t" - "stxvw4x %x3, %7, %2 \n\t" - "stxvw4x %x3, %8, %2 \n\t" - "stxvw4x %x3, %9, %2 \n\t" - "stxvw4x %x3, %10, %2 \n\t" + "stxvd2x %x3, 0, %2 \n\t" + "stxvd2x %x3, %4, %2 \n\t" + "stxvd2x %x3, %5, %2 \n\t" + "stxvd2x %x3, %6, %2 \n\t" + "stxvd2x %x3, %7, %2 \n\t" + "stxvd2x %x3, %8, %2 \n\t" + "stxvd2x %x3, %9, %2 \n\t" + "stxvd2x %x3, 
%10, %2 \n\t" "addi %2, %2, 128 \n\t" From 34513be72654102504f231cc27b33d26eddf88ac Mon Sep 17 00:00:00 2001 From: Neil Shipp Date: Fri, 23 Jun 2017 13:07:34 -0700 Subject: [PATCH 10/42] Add Microsoft Windows 10 UWP build support --- CMakeLists.txt | 9 ++++- cmake/c_check.cmake | 5 +++ cmake/prebuild.cmake | 62 ++++++++++++++++++------------- common.h | 13 ++++++- driver/others/CMakeLists.txt | 2 + driver/others/blas_server_win32.c | 7 +++- utest/CMakeLists.txt | 4 ++ 7 files changed, 70 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c20a57eac..e6ae891b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h ) - ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h) + + ADD_CUSTOM_TARGET(genconfig + ALL + DEPENDS openblas_config.h + ) add_dependencies(genconfig ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) @@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME} message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(genf77blas + ALL COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h ) @@ -255,11 +260,11 @@ if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(gencblas + ALL COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h ) - add_dependencies(gencblas ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 115bdaf4e..56ae612ea 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF} "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${FU}\n") +if (${HOST_OS} STREQUAL "WINDOWSSTORE") + file(APPEND ${TARGET_CONF} + "#define OS_WINNT\t1\n") +endif () + diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6a21c0bcc..a7f98bfb8 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -72,20 +72,26 @@ if (MSVC) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) endif() +if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + # disable WindowsStore strict CRT checks + set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) +endif () + set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) -try_compile(GETARCH_RESULT ${GETARCH_DIR} - SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GETARCH_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GETARCH_RESULT ${GETARCH_DIR} + SOURCES ${GETARCH_SRC} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH_LOG + COPY_FILE 
${PROJECT_BINARY_DIR}/${GETARCH_BIN} + ) -if (NOT ${GETARCH_RESULT}) - MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") + if (NOT ${GETARCH_RESULT}) + MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") + endif () endif () - message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way @@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT}) set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH2_DIR}) -try_compile(GETARCH2_RESULT ${GETARCH2_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GETARCH2_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GETARCH2_RESULT ${GETARCH2_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH2_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} + ) -if (NOT ${GETARCH2_RESULT}) - MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") + if (NOT ${GETARCH2_RESULT}) + MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") + endif () endif () # use the cmake binary w/ the -E param to run a shell command in a cross-platform way @@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}") set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) -try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GEN_CONFIG_H_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GEN_CONFIG_H_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} + ) -if (NOT ${GEN_CONFIG_H_RESULT}) - MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") -endif () + if (NOT ${GEN_CONFIG_H_RESULT}) + MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") + endif () +endif () \ No newline at end of file diff --git a/common.h b/common.h index c9cc2f0f2..4463141c8 100644 --- a/common.h +++ b/common.h @@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #endif #ifndef ASSEMBLER +#ifdef OS_WINDOWSSTORE +typedef char env_var_t[MAX_PATH]; +#define readenv(p, n) 0 +#else #ifdef OS_WINDOWS typedef char env_var_t[MAX_PATH]; #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) @@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH]; typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif +#endif #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #ifdef _POSIX_MONOTONIC_CLOCK @@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){ *address = 0; } - +#ifdef OS_WINDOWSSTORE +static __inline int readenv_atoi(char *env) { + return 0; +} +#else #ifdef OS_WINDOWS static __inline int readenv_atoi(char *env) { env_var_t p; @@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) { 
return(0); } #endif - +#endif #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 489d40c76..8e0be1e0e 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -12,6 +12,8 @@ if (SMP) set(BLAS_SERVER blas_server_omp.c) elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(BLAS_SERVER blas_server_win32.c) + elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") + set(BLAS_SERVER blas_server_win32.c) endif () if (NOT DEFINED BLAS_SERVER) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 081bdd7d4..cde8ca793 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ - WaitForSingleObject(blas_threads[i], 5); //INFINITE); - TerminateThread(blas_threads[i],0); + WaitForSingleObject(blas_threads[i], 5); //INFINITE); +#ifndef OS_WINDOWSSTORE +// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP + TerminateThread(blas_threads[i],0); +#endif } blas_server_avail = 0; diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 9cf518e05..bd31ed9c6 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -21,6 +21,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(${OpenBLAS_utest_bin} m) endif() +if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") +set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS") +endif() + #Set output for utest set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) From 9b7b5f7fdc3f60e33c24f8de3dc708a50f7a33e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Jun 2017 17:38:41 +0200 Subject: [PATCH 11/42] Add Elmar Peise's ReLAPACK --- relapack/LICENSE | 22 + relapack/Makefile | 64 ++ relapack/README.md | 68 ++ relapack/config.h | 208 ++++++ relapack/config.md | 87 +++ relapack/coverage.md | 212 ++++++ relapack/inc/relapack.h | 67 ++ relapack/src/blas.h | 61 ++ relapack/src/cgbtrf.c | 230 ++++++ relapack/src/cgemmt.c | 167 +++++ relapack/src/cgetrf.c | 117 +++ relapack/src/chegst.c | 212 ++++++ relapack/src/chetrf.c | 236 ++++++ relapack/src/chetrf_rec2.c | 520 ++++++++++++++ relapack/src/chetrf_rook.c | 236 ++++++ relapack/src/chetrf_rook_rec2.c | 661 +++++++++++++++++ relapack/src/clauum.c | 87 +++ relapack/src/cpbtrf.c | 157 ++++ relapack/src/cpotrf.c | 92 +++ relapack/src/csytrf.c | 238 ++++++ relapack/src/csytrf_rec2.c | 451 ++++++++++++ relapack/src/csytrf_rook.c | 236 ++++++ relapack/src/csytrf_rook_rec2.c | 565 +++++++++++++++ relapack/src/ctgsyl.c | 268 +++++++ relapack/src/ctrsyl.c | 163 +++++ relapack/src/ctrsyl_rec2.c | 392 ++++++++++ relapack/src/ctrtri.c | 107 +++ relapack/src/dgbtrf.c | 227 ++++++ relapack/src/dgemmt.c | 165 +++++ relapack/src/dgetrf.c | 117 +++ relapack/src/dlauum.c | 87 +++ relapack/src/dpbtrf.c | 157 ++++ relapack/src/dpotrf.c | 92 +++ relapack/src/dsygst.c | 212 ++++++ relapack/src/dsytrf.c | 238 ++++++ relapack/src/dsytrf_rec2.c | 352 +++++++++ relapack/src/dsytrf_rook.c | 236 ++++++ relapack/src/dsytrf_rook_rec2.c | 451 ++++++++++++ relapack/src/dtgsyl.c | 274 +++++++ relapack/src/dtrsyl.c | 169 +++++ relapack/src/dtrsyl_rec2.c | 1034 +++++++++++++++++++++++++++ relapack/src/dtrtri.c | 107 +++ relapack/src/f2c.c | 109 
+++ relapack/src/f2c.h | 223 ++++++ relapack/src/lapack.h | 80 +++ relapack/src/lapack_wrappers.c | 607 ++++++++++++++++ relapack/src/lapack_wrappers.c.orig | 607 ++++++++++++++++ relapack/src/relapack.h | 60 ++ relapack/src/sgbtrf.c | 227 ++++++ relapack/src/sgemmt.c | 165 +++++ relapack/src/sgetrf.c | 117 +++ relapack/src/slauum.c | 87 +++ relapack/src/spbtrf.c | 157 ++++ relapack/src/spotrf.c | 92 +++ relapack/src/ssygst.c | 212 ++++++ relapack/src/ssytrf.c | 238 ++++++ relapack/src/ssytrf_rec2.c | 351 +++++++++ relapack/src/ssytrf_rook.c | 236 ++++++ relapack/src/ssytrf_rook_rec2.c | 451 ++++++++++++ relapack/src/stgsyl.c | 274 +++++++ relapack/src/strsyl.c | 169 +++++ relapack/src/strsyl_rec2.c | 1029 ++++++++++++++++++++++++++ relapack/src/strtri.c | 107 +++ relapack/src/zgbtrf.c | 230 ++++++ relapack/src/zgemmt.c | 167 +++++ relapack/src/zgetrf.c | 117 +++ relapack/src/zhegst.c | 212 ++++++ relapack/src/zhetrf.c | 236 ++++++ relapack/src/zhetrf_rec2.c | 524 ++++++++++++++ relapack/src/zhetrf_rook.c | 236 ++++++ relapack/src/zhetrf_rook_rec2.c | 662 +++++++++++++++++ relapack/src/zlauum.c | 87 +++ relapack/src/zpbtrf.c | 157 ++++ relapack/src/zpotrf.c | 92 +++ relapack/src/zsytrf.c | 238 ++++++ relapack/src/zsytrf_rec2.c | 452 ++++++++++++ relapack/src/zsytrf_rook.c | 236 ++++++ relapack/src/zsytrf_rook_rec2.c | 561 +++++++++++++++ relapack/src/ztgsyl.c | 268 +++++++ relapack/src/ztrsyl.c | 163 +++++ relapack/src/ztrsyl_rec2.c | 394 ++++++++++ relapack/src/ztrtri.c | 107 +++ 82 files changed, 20579 insertions(+) create mode 100644 relapack/LICENSE create mode 100644 relapack/Makefile create mode 100644 relapack/README.md create mode 100644 relapack/config.h create mode 100644 relapack/config.md create mode 100644 relapack/coverage.md create mode 100644 relapack/inc/relapack.h create mode 100644 relapack/src/blas.h create mode 100644 relapack/src/cgbtrf.c create mode 100644 relapack/src/cgemmt.c create mode 100644 relapack/src/cgetrf.c create mode 100644 relapack/src/chegst.c create mode 100644 relapack/src/chetrf.c create mode 100644 relapack/src/chetrf_rec2.c create mode 100644 relapack/src/chetrf_rook.c create mode 100644 relapack/src/chetrf_rook_rec2.c create mode 100644 relapack/src/clauum.c create mode 100644 relapack/src/cpbtrf.c create mode 100644 relapack/src/cpotrf.c create mode 100644 relapack/src/csytrf.c create mode 100644 relapack/src/csytrf_rec2.c create mode 100644 relapack/src/csytrf_rook.c create mode 100644 relapack/src/csytrf_rook_rec2.c create mode 100644 relapack/src/ctgsyl.c create mode 100644 relapack/src/ctrsyl.c create mode 100644 relapack/src/ctrsyl_rec2.c create mode 100644 relapack/src/ctrtri.c create mode 100644 relapack/src/dgbtrf.c create mode 100644 relapack/src/dgemmt.c create mode 100644 relapack/src/dgetrf.c create mode 100644 relapack/src/dlauum.c create mode 100644 relapack/src/dpbtrf.c create mode 100644 relapack/src/dpotrf.c create mode 100644 relapack/src/dsygst.c create mode 100644 relapack/src/dsytrf.c create mode 100644 relapack/src/dsytrf_rec2.c create mode 100644 relapack/src/dsytrf_rook.c create mode 100644 relapack/src/dsytrf_rook_rec2.c create mode 100644 relapack/src/dtgsyl.c create mode 100644 relapack/src/dtrsyl.c create mode 100644 relapack/src/dtrsyl_rec2.c create mode 100644 relapack/src/dtrtri.c create mode 100644 relapack/src/f2c.c create mode 100644 relapack/src/f2c.h create mode 100644 relapack/src/lapack.h create mode 100644 relapack/src/lapack_wrappers.c create mode 100644 relapack/src/lapack_wrappers.c.orig create mode 
100644 relapack/src/relapack.h create mode 100644 relapack/src/sgbtrf.c create mode 100644 relapack/src/sgemmt.c create mode 100644 relapack/src/sgetrf.c create mode 100644 relapack/src/slauum.c create mode 100644 relapack/src/spbtrf.c create mode 100644 relapack/src/spotrf.c create mode 100644 relapack/src/ssygst.c create mode 100644 relapack/src/ssytrf.c create mode 100644 relapack/src/ssytrf_rec2.c create mode 100644 relapack/src/ssytrf_rook.c create mode 100644 relapack/src/ssytrf_rook_rec2.c create mode 100644 relapack/src/stgsyl.c create mode 100644 relapack/src/strsyl.c create mode 100644 relapack/src/strsyl_rec2.c create mode 100644 relapack/src/strtri.c create mode 100644 relapack/src/zgbtrf.c create mode 100644 relapack/src/zgemmt.c create mode 100644 relapack/src/zgetrf.c create mode 100644 relapack/src/zhegst.c create mode 100644 relapack/src/zhetrf.c create mode 100644 relapack/src/zhetrf_rec2.c create mode 100644 relapack/src/zhetrf_rook.c create mode 100644 relapack/src/zhetrf_rook_rec2.c create mode 100644 relapack/src/zlauum.c create mode 100644 relapack/src/zpbtrf.c create mode 100644 relapack/src/zpotrf.c create mode 100644 relapack/src/zsytrf.c create mode 100644 relapack/src/zsytrf_rec2.c create mode 100644 relapack/src/zsytrf_rook.c create mode 100644 relapack/src/zsytrf_rook_rec2.c create mode 100644 relapack/src/ztgsyl.c create mode 100644 relapack/src/ztrsyl.c create mode 100644 relapack/src/ztrsyl_rec2.c create mode 100644 relapack/src/ztrtri.c diff --git a/relapack/LICENSE b/relapack/LICENSE new file mode 100644 index 000000000..edeb4046e --- /dev/null +++ b/relapack/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2016 Elmar Peise + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/relapack/Makefile b/relapack/Makefile new file mode 100644 index 000000000..1e81b5423 --- /dev/null +++ b/relapack/Makefile @@ -0,0 +1,64 @@ +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + + + +SRC = $(wildcard src/*.c) +OBJS = $(SRC:%.c=%.o) + +TEST_SUITS = \ + slauum dlauum clauum zlauum \ + spotrf dpotrf cpotrf zpotrf \ + spbtrf dpbtrf cpbtrf zpbtrf \ + ssygst dsygst chegst zhegst \ + ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ + sgetrf dgetrf cgetrf zgetrf \ + sgbtrf dgbtrf cgbtrf zgbtrf \ + strsyl dtrsyl ctrsyl ztrsyl \ + stgsyl dtgsyl ctgsyl ztgsyl \ + sgemmt dgemmt cgemmt zgemmt +TESTS = $(TEST_SUITS:%=test/%.pass) # dummies +TEST_EXES = $(TEST_SUITS:%=test/%.x) + +LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm + +.SECONDARY: $(TEST_EXES) +.PHONY: test + +# ReLAPACK compilation + +libs: $(OBJS) + @echo "Building ReLAPACK library $(LIBNAME)" + $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) + $(RANLIB) $(TOPDIR)/$(LIBNAME) + +%.o: %.c config.h + $(CC) $(CFLAGS) -c $< -o $@ + + +# ReLAPACK testing + +test: $(TEST_EXES) $(TESTS) + @echo "passed all tests" + +test/%.pass: test/%.x + @echo -n $*: + @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) + +test/s%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/d%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/c%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/z%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + + +# cleaning up + +clean: + rm -f $(OBJS) test/util.o test/*.x diff --git a/relapack/README.md b/relapack/README.md new file mode 100644 index 000000000..1947c1748 --- /dev/null +++ b/relapack/README.md @@ -0,0 +1,68 @@ +ReLAPACK +======== + +[![Build Status](https://travis-ci.org/HPAC/ReLAPACK.svg?branch=master)](https://travis-ci.org/HPAC/ReLAPACK) + +[Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK) + +ReLAPACK offers a collection of recursive algorithms for many of LAPACK's +compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK +integrates effortlessly into existing application codes. ReLAPACK's routines +not only outperform the reference LAPACK but also improve upon the performance +of tuned implementations, such as OpenBLAS and MKL. + + +Coverage +-------- +For a detailed list of covered operations and an overview of operations to which +recursion is not efficiently applicable, see [coverage.md](coverage.md). + + +Installation +------------ +To compile with the default configuration, simply run `make` to create the +library `librelapack.a`. + +### Linking with MKL +Note that to link with MKL, you currently need to set the flag +`COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and +`ztrsyl`. For further configuration options see [config.md](config.md). + + +### Dependencies +ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked +kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized +and machine specific implementations of these libraries, which are commonly +provided by hardware vendors or available as open source (e.g., +[OpenBLAS](http://www.openblas.net/)). 
+ + +Testing +------- +ReLAPACK's test suite compares its routines numerically with LAPACK's +counterparts. To set up the tests (located int `test/`) you need to specify +link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then +`make test` runs the tests. For details on the performed tests, see +[test/README.md](test/README.md). + + +Examples +-------- +Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the +covered routines applies directly to ReLAPACK. A few separate examples are +given in `examples/`. For details, see [examples/README.md](examples/README.md). + + +Citing +------ +When referencing ReLAPACK, please cite the preprint of the paper +[Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763): + + @article{relapack, + author = {Elmar Peise and Paolo Bientinesi}, + title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection}, + journal = {CoRR}, + volume = {abs/1602.06763}, + year = {2016}, + url = {http://arxiv.org/abs/1602.06763}, + } diff --git a/relapack/config.h b/relapack/config.h new file mode 100644 index 000000000..9113a712d --- /dev/null +++ b/relapack/config.h @@ -0,0 +1,208 @@ +#ifndef RELAPACK_CONFIG_H +#define RELAPACK_CONFIG_H + +// ReLAPACK configuration file. +// See also config.md + + +/////////////////////////////// +// BLAS/LAPACK obect symbols // +/////////////////////////////// + +// BLAS routines linked against have a trailing underscore +#define BLAS_UNDERSCORE 1 +// LAPACK routines linked against have a trailing underscore +#define LAPACK_UNDERSCORE BLAS_UNDERSCORE + +// Complex BLAS/LAPACK routines return their result in the first argument +// This option must be enabled when linking to MKL for ctrsyl and ztrsyl to +// work. +#define COMPLEX_FUNCTIONS_AS_ROUTINES 0 +#ifdef F_INTERFACE_INTEL +#define COMPLEX_FUNCTIONS_AS_ROUTINES 1 +#endif +#define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES +#define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES + +// The BLAS-like extension xgemmt is provided by an external library. 
+#define HAVE_XGEMMT 0 + + +//////////////////////////// +// Use malloc in ReLAPACK // +//////////////////////////// + +#define ALLOW_MALLOC 1 +// allow malloc in xsygst for improved performance +#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC +// allow malloc in xsytrf if the passed work buffer is too small +#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC + + +//////////////////////////////// +// LAPACK routine replacement // +//////////////////////////////// +// The following macros specify which routines are included in the library under +// LAPACK's symbol names: 1 included, 0 not included + +#define INCLUDE_ALL 1 + +#define INCLUDE_XLAUUM INCLUDE_ALL +#define INCLUDE_SLAUUM INCLUDE_XLAUUM +#define INCLUDE_DLAUUM INCLUDE_XLAUUM +#define INCLUDE_CLAUUM INCLUDE_XLAUUM +#define INCLUDE_ZLAUUM INCLUDE_XLAUUM + +#define INCLUDE_XSYGST INCLUDE_ALL +#define INCLUDE_SSYGST INCLUDE_XSYGST +#define INCLUDE_DSYGST INCLUDE_XSYGST +#define INCLUDE_CHEGST INCLUDE_XSYGST +#define INCLUDE_ZHEGST INCLUDE_XSYGST + +#define INCLUDE_XTRTRI INCLUDE_ALL +#define INCLUDE_STRTRI INCLUDE_XTRTRI +#define INCLUDE_DTRTRI INCLUDE_XTRTRI +#define INCLUDE_CTRTRI INCLUDE_XTRTRI +#define INCLUDE_ZTRTRI INCLUDE_XTRTRI + +#define INCLUDE_XPOTRF INCLUDE_ALL +#define INCLUDE_SPOTRF INCLUDE_XPOTRF +#define INCLUDE_DPOTRF INCLUDE_XPOTRF +#define INCLUDE_CPOTRF INCLUDE_XPOTRF +#define INCLUDE_ZPOTRF INCLUDE_XPOTRF + +#define INCLUDE_XPBTRF INCLUDE_ALL +#define INCLUDE_SPBTRF INCLUDE_XPBTRF +#define INCLUDE_DPBTRF INCLUDE_XPBTRF +#define INCLUDE_CPBTRF INCLUDE_XPBTRF +#define INCLUDE_ZPBTRF INCLUDE_XPBTRF + +#define INCLUDE_XSYTRF INCLUDE_ALL +#define INCLUDE_SSYTRF INCLUDE_XSYTRF +#define INCLUDE_DSYTRF INCLUDE_XSYTRF +#define INCLUDE_CSYTRF INCLUDE_XSYTRF +#define INCLUDE_CHETRF INCLUDE_XSYTRF +#define INCLUDE_ZSYTRF INCLUDE_XSYTRF +#define INCLUDE_ZHETRF INCLUDE_XSYTRF +#define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF +#define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF +#define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF +#define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF +#define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF +#define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF + +#define INCLUDE_XGETRF INCLUDE_ALL +#define INCLUDE_SGETRF INCLUDE_XGETRF +#define INCLUDE_DGETRF INCLUDE_XGETRF +#define INCLUDE_CGETRF INCLUDE_XGETRF +#define INCLUDE_ZGETRF INCLUDE_XGETRF + +#define INCLUDE_XGBTRF INCLUDE_ALL +#define INCLUDE_SGBTRF INCLUDE_XGBTRF +#define INCLUDE_DGBTRF INCLUDE_XGBTRF +#define INCLUDE_CGBTRF INCLUDE_XGBTRF +#define INCLUDE_ZGBTRF INCLUDE_XGBTRF + +#define INCLUDE_XTRSYL INCLUDE_ALL +#define INCLUDE_STRSYL INCLUDE_XTRSYL +#define INCLUDE_DTRSYL INCLUDE_XTRSYL +#define INCLUDE_CTRSYL INCLUDE_XTRSYL +#define INCLUDE_ZTRSYL INCLUDE_XTRSYL + +#define INCLUDE_XTGSYL INCLUDE_ALL +#define INCLUDE_STGSYL INCLUDE_XTGSYL +#define INCLUDE_DTGSYL INCLUDE_XTGSYL +#define INCLUDE_CTGSYL INCLUDE_XTGSYL +#define INCLUDE_ZTGSYL INCLUDE_XTGSYL + +#define INCLUDE_XGEMMT 0 +#define INCLUDE_SGEMMT INCLUDE_XGEMMT +#define INCLUDE_DGEMMT INCLUDE_XGEMMT +#define INCLUDE_CGEMMT INCLUDE_XGEMMT +#define INCLUDE_ZGEMMT INCLUDE_XGEMMT + + +///////////////////// +// crossover sizes // +///////////////////// + +// default crossover size +#define CROSSOVER 24 + +// individual crossover sizes +#define CROSSOVER_XLAUUM CROSSOVER +#define CROSSOVER_SLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_DLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_CLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM + +#define CROSSOVER_XSYGST CROSSOVER +#define CROSSOVER_SSYGST CROSSOVER_XSYGST +#define CROSSOVER_DSYGST 
CROSSOVER_XSYGST +#define CROSSOVER_CHEGST CROSSOVER_XSYGST +#define CROSSOVER_ZHEGST CROSSOVER_XSYGST + +#define CROSSOVER_XTRTRI CROSSOVER +#define CROSSOVER_STRTRI CROSSOVER_XTRTRI +#define CROSSOVER_DTRTRI CROSSOVER_XTRTRI +#define CROSSOVER_CTRTRI CROSSOVER_XTRTRI +#define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI + +#define CROSSOVER_XPOTRF CROSSOVER +#define CROSSOVER_SPOTRF CROSSOVER_XPOTRF +#define CROSSOVER_DPOTRF CROSSOVER_XPOTRF +#define CROSSOVER_CPOTRF CROSSOVER_XPOTRF +#define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF + +#define CROSSOVER_XPBTRF CROSSOVER +#define CROSSOVER_SPBTRF CROSSOVER_XPBTRF +#define CROSSOVER_DPBTRF CROSSOVER_XPBTRF +#define CROSSOVER_CPBTRF CROSSOVER_XPBTRF +#define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF + +#define CROSSOVER_XSYTRF CROSSOVER +#define CROSSOVER_SSYTRF CROSSOVER_XSYTRF +#define CROSSOVER_DSYTRF CROSSOVER_XSYTRF +#define CROSSOVER_CSYTRF CROSSOVER_XSYTRF +#define CROSSOVER_CHETRF CROSSOVER_XSYTRF +#define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF +#define CROSSOVER_ZHETRF CROSSOVER_XSYTRF +#define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF +#define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF +#define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF +#define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF +#define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF +#define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF + +#define CROSSOVER_XGETRF CROSSOVER +#define CROSSOVER_SGETRF CROSSOVER_XGETRF +#define CROSSOVER_DGETRF CROSSOVER_XGETRF +#define CROSSOVER_CGETRF CROSSOVER_XGETRF +#define CROSSOVER_ZGETRF CROSSOVER_XGETRF + +#define CROSSOVER_XGBTRF CROSSOVER +#define CROSSOVER_SGBTRF CROSSOVER_XGBTRF +#define CROSSOVER_DGBTRF CROSSOVER_XGBTRF +#define CROSSOVER_CGBTRF CROSSOVER_XGBTRF +#define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF + +#define CROSSOVER_XTRSYL CROSSOVER +#define CROSSOVER_STRSYL CROSSOVER_XTRSYL +#define CROSSOVER_DTRSYL CROSSOVER_XTRSYL +#define CROSSOVER_CTRSYL CROSSOVER_XTRSYL +#define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL + +#define CROSSOVER_XTGSYL CROSSOVER +#define CROSSOVER_STGSYL CROSSOVER_XTGSYL +#define CROSSOVER_DTGSYL CROSSOVER_XTGSYL +#define CROSSOVER_CTGSYL CROSSOVER_XTGSYL +#define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL + +// sytrf helper routine +#define CROSSOVER_XGEMMT CROSSOVER_XSYTRF +#define CROSSOVER_SGEMMT CROSSOVER_XGEMMT +#define CROSSOVER_DGEMMT CROSSOVER_XGEMMT +#define CROSSOVER_CGEMMT CROSSOVER_XGEMMT +#define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT + +#endif /* RELAPACK_CONFIG_H */ diff --git a/relapack/config.md b/relapack/config.md new file mode 100644 index 000000000..ea14be16a --- /dev/null +++ b/relapack/config.md @@ -0,0 +1,87 @@ +RELAPACK Configuration +====================== + +ReLAPACK has two configuration files: `make.inc`, which is included by the +Makefile, and `config.h` which is included in the source files. + + +Build and Testing Environment +----------------------------- +The build environment (compiler and flags) and the test configuration (linker +flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size +and error bounds are defined in `test/config.h`. + +The library `librelapack.a` is compiled by invoking `make`. The tests are +performed by either `make test` or calling `make` in the test folder. 
+
+
+BLAS/LAPACK complex function interfaces
+---------------------------------------
+For BLAS and LAPACK functions that return a complex number, there exist two
+conflicting (FORTRAN compiler dependent) calling conventions: either the
+result is returned as a `struct` of two floating point numbers, or a pointer
+to such a `struct` is passed as an additional first argument. By default
+ReLAPACK uses the former (which is what gfortran uses), but it can switch to
+the latter by setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS
+and LAPACK specific counterparts) to `1` in `config.h`.
+
+**For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.**
+
+(Using the wrong convention will break `ctrsyl` and `ztrsyl`, and the test
+cases will segfault or return errors on the order of 1 or larger.)
+
+
+BLAS extension `xgemmt`
+-----------------------
+The LDL decompositions require `xgemmt`, a general matrix-matrix product that
+updates only a triangular part of the result matrix. If the BLAS
+implementation linked against provides such a routine, set the flag
+`HAVE_XGEMMT` to `1` in `config.h`; otherwise, ReLAPACK uses its own recursive
+implementation of these kernels.
+
+`xgemmt` is provided by MKL.
+
+
+Routine Selection
+-----------------
+ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the
+corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set
+to `1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g.,
+`dgetrf_`). By default, wrappers for all routines are enabled.
+
+
+Crossover Size
+--------------
+The crossover size determines below which matrix sizes ReLAPACK's recursive
+algorithms switch to LAPACK's unblocked routines, thereby avoiding calls to
+BLAS Level 3 routines on tiny matrices. The crossover size is set in
+`config.h` and can be chosen either globally for the entire library, by
+operation, or individually by routine.
+
+
+Allowing Temporary Buffers
+--------------------------
+Two of ReLAPACK's routines make use of temporary buffers, which are allocated
+and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine
+specific counterparts) to `0` in `config.h` disables these buffers. The
+affected routines are:
+
+ * `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in
+   LAPACK, this size can be queried by setting `lWork = -1`; the passed buffer
+   is used if it is large enough, and a local buffer is allocated only if it
+   is not.
+
+   The advantage of this mechanism is that ReLAPACK works seamlessly even with
+   codes that statically provide too little memory, instead of breaking them.
+
+ * `xsygst`: The reduction of a real symmetric-definite generalized
+   eigenproblem to standard form can use an auxiliary buffer of size n^2 / 2
+   to avoid redundant computations. It thereby performs about 30% fewer FLOPs
+   than LAPACK.
+
+
+FORTRAN symbol names
+--------------------
+ReLAPACK is commonly linked to BLAS and LAPACK libraries with standard FORTRAN
+interfaces. Since these libraries usually append an underscore to their symbol
+names, ReLAPACK has configuration switches in `config.h` to adjust the
+corresponding routine names.
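+
+A hypothetical sketch of such a switch (the `LAPACK(...)` wrapper macro below
+mirrors the one used throughout ReLAPACK's sources; the exact name of the
+on/off switch in `config.h` may differ):
+
+```c
+/* Append the trailing underscore expected by gfortran-compiled libraries. */
+#define LAPACK_UNDERSCORE 1
+
+#if LAPACK_UNDERSCORE
+#define LAPACK(routine) routine##_
+#else
+#define LAPACK(routine) routine
+#endif
+
+/* LAPACK(dgetf2) now expands to dgetf2_ (or dgetf2, respectively). */
+```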
diff --git a/relapack/coverage.md b/relapack/coverage.md
new file mode 100644
index 000000000..8406b2078
--- /dev/null
+++ b/relapack/coverage.md
@@ -0,0 +1,212 @@
+Coverage of ReLAPACK
+====================
+
+This file lists all LAPACK compute routines that are covered by recursive
+algorithms in ReLAPACK. It also lists all of LAPACK's blocked algorithms which
+are not (yet) part of ReLAPACK.
+
+
+**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
+
+- [List of covered LAPACK routines](#list-of-covered-lapack-routines)
+  - [`xlauum`](#xlauum)
+  - [`xsygst`](#xsygst)
+  - [`xtrtri`](#xtrtri)
+  - [`xpotrf`](#xpotrf)
+  - [`xpbtrf`](#xpbtrf)
+  - [`xsytrf`](#xsytrf)
+  - [`xgetrf`](#xgetrf)
+  - [`xgbtrf`](#xgbtrf)
+  - [`xtrsyl`](#xtrsyl)
+  - [`xtgsyl`](#xtgsyl)
+- [Covered BLAS extension](#covered-blas-extension)
+  - [`xgemmt`](#xgemmt)
+- [Not covered yet](#not-covered-yet)
+  - [`xpstrf`](#xpstrf)
+- [Not covered: extra FLOPs](#not-covered-extra-flops)
+  - [QR decomposition (and related)](#qr-decomposition-and-related)
+  - [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal)
+  - [Reduction to bidiagonal](#reduction-to-bidiagonal)
+  - [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg)
+
+
+List of covered LAPACK routines
+-------------------------------
+
+### `xlauum`
+Multiplication of a triangular matrix with its (complex conjugate) transpose,
+resulting in a symmetric (Hermitian) matrix.
+
+Routines: `slauum`, `dlauum`, `clauum`, `zlauum`
+
+Operations:
+* A = L^T L
+* A = U U^T
+
+### `xsygst`
+Simultaneous two-sided multiplication of a symmetric matrix with a triangular
+matrix and its transpose
+
+Routines: `ssygst`, `dsygst`, `chegst`, `zhegst`
+
+Operations:
+* A = inv(L) A inv(L^T)
+* A = inv(U^T) A inv(U)
+* A = L^T A L
+* A = U A U^T
+
+### `xtrtri`
+Inversion of a triangular matrix
+
+Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri`
+
+Operations:
+* L = inv(L)
+* U = inv(U)
+
+### `xpotrf`
+Cholesky decomposition of a symmetric (Hermitian) positive definite matrix
+
+Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf`
+
+Operations:
+* L L^T = A
+* U^T U = A
+
+### `xpbtrf`
+Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix
+
+Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf`
+
+Operations:
+* L L^T = A
+* U^T U = A
+
+### `xsytrf`
+LDL decomposition of a symmetric (or Hermitian) matrix
+
+Routines:
+* `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`
+* `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`,
+  `zhetrf_rook`
+
+Operations:
+* L D L^T = A
+* U^T D U = A
+
+### `xgetrf`
+LU decomposition of a general matrix with pivoting
+
+Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf`
+
+Operation: P L U = A
+
+### `xgbtrf`
+LU decomposition of a general banded matrix with pivoting
+
+Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf`
+
+Operation: L U = A
+
+### `xtrsyl`
+Solution of the quasi-triangular Sylvester equation
+
+Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl`
+
+Operations:
+* A X + X B = C -> X
+* A^T X + X B = C -> X
+* A X + X B^T = C -> X
+* A^T X + X B^T = C -> X
+* A X - X B = C -> X
+* A^T X - X B = C -> X
+* A X - X B^T = C -> X
+* A^T X - X B^T = C -> X
+
+### `xtgsyl`
+Solution of the generalized Sylvester equations
+
+Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl`
+
+Operations:
+* A R - L B = C, D R - L E = F -> L, R
+* A^T R + D^T L = C, R B^T + L E^T = -F -> L, R
+
+
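+As an illustration (not part of the original list): the covered routines are
+callable directly under their `RELAPACK_` names. The prototype below is the
+one declared in `inc/relapack.h`; a 1-by-1 Sylvester system makes the
+semantics easy to check by hand.
+
+```c
+#include <stdio.h>
+
+void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *,
+                     const int *, const double *, const int *, const double *,
+                     const int *, double *, const int *, double *, int *);
+
+int main(void) {
+    /* Solve A X + X B = scale * C with A = [2], B = [3], C = [10]: X = 2. */
+    const int isgn = 1, m = 1, n = 1, ld = 1;
+    const double A = 2., B = 3.;
+    double C = 10., scale;   /* the solution X overwrites C */
+    int info;
+    RELAPACK_dtrsyl("N", "N", &isgn, &m, &n, &A, &ld, &B, &ld, &C, &ld, &scale, &info);
+    printf("X = %g, scale = %g, info = %d\n", C / scale, scale, info);
+    return 0;
+}
+```
+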
+Covered BLAS extension
+----------------------
+
+### `xgemmt`
+Matrix-matrix product updating only a triangular part of the result
+
+Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt`
+
+Operations:
+* C = alpha A B + beta C
+* C = alpha A B^T + beta C
+* C = alpha A^T B + beta C
+* C = alpha A^T B^T + beta C
+
+
+Not covered yet
+---------------
+The following operation is implemented as a blocked algorithm in LAPACK but is
+not yet covered in ReLAPACK as a recursive algorithm.
+
+### `xpstrf`
+Cholesky decomposition of a positive semi-definite matrix with complete
+pivoting.
+
+Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf`
+
+Operations:
+* P L L^T P^T = A
+* P U^T U P^T = A
+
+
+Not covered: extra FLOPs
+------------------------
+The following routines are not covered because recursive variants would
+require considerably more FLOPs or operate on banded matrices.
+
+### QR decomposition (and related)
+Routines:
+* `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf`
+* `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf`
+* `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf`
+* `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf`
+* `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf`
+
+Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A
+
+Routines for multiplication with Q:
+* `sormqr`, `dormqr`, `cunmqr`, `zunmqr`
+* `sormrq`, `dormrq`, `cunmrq`, `zunmrq`
+* `sormql`, `dormql`, `cunmql`, `zunmql`
+* `sormlq`, `dormlq`, `cunmlq`, `zunmlq`
+* `sormrz`, `dormrz`, `cunmrz`, `zunmrz`
+
+Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T
+
+Routines for construction of Q:
+* `sorgqr`, `dorgqr`, `cungqr`, `zungqr`
+* `sorgrq`, `dorgrq`, `cungrq`, `zungrq`
+* `sorgql`, `dorgql`, `cungql`, `zungql`
+* `sorglq`, `dorglq`, `cunglq`, `zunglq`
+
+### Symmetric reduction to tridiagonal
+Routines: `ssytrd`, `dsytrd`, `chetrd`, `zhetrd`
+
+Operation: Q T Q^T = A
+
+### Reduction to bidiagonal
+Routines: `sgebrd`, `dgebrd`, `cgebrd`, `zgebrd`
+
+Operation: Q B P^T = A
+
+### Reduction to upper Hessenberg
+Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd`
+
+Operation: Q H Q^T = A
diff --git a/relapack/inc/relapack.h b/relapack/inc/relapack.h
new file mode 100644
index 000000000..e421f352b
--- /dev/null
+++ b/relapack/inc/relapack.h
@@ -0,0 +1,67 @@
+#ifndef RELAPACK_H
+#define RELAPACK_H
+
+void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
+void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
+void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
+void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
+void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
+void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
+void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
+void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
+void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
+void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
+void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
+void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
+void
RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); + +void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); + +void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); + +void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); + +void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); + +void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); + +void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int 
*, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); + +void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +#endif /* RELAPACK_H */ diff --git a/relapack/src/blas.h b/relapack/src/blas.h new file mode 100644 index 000000000..7441c1033 --- /dev/null +++ b/relapack/src/blas.h @@ -0,0 +1,61 @@ +#ifndef BLAS_H +#define BLAS_H + +extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); +extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); +extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); +extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); + +extern void BLAS(sscal)(const int *, const float *, float *, const int *); +extern void BLAS(dscal)(const int *, const double *, double *, const int *); +extern void BLAS(cscal)(const int *, const float *, float *, const int *); +extern void BLAS(zscal)(const int *, const double *, double *, const int *); + +extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void 
BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); + +extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); + +extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); +extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); +extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); +extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); + +extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, 
const double *, double *, const int *); +extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +#if HAVE_XGEMMT +extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +#endif + +#endif /* BLAS_H */ diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c new file mode 100644 index 000000000..90b2c8789 --- /dev/null +++ b/relapack/src/cgbtrf.c @@ -0,0 +1,230 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, + const int *, float *, const int *, int *, float *, const int *, float *, + const int *, int *); + + +/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's cgbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html + * */ +void RELAPACK_cgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CGBTRF", &minfo); + return; + } + + // Constant + const float ZERO[] = { 0., 0. 
}; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + float *const A_j = A + 2 * *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[2 * i] = A_j[2 * i + 1] = 0.; + } + + // Allocate work space + const int n1 = CREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); + float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); + LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** cgbtrf's recursive compute kernel */ +static void RELAPACK_cgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { + // Unblocked + LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * kv; + + // Splitting + const int n1 = MIN(CREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + float *const Ab_L = Ab; + float *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * m1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + float *const A_Rl = A_R; + float *const A_Rr = A_R + 2 * *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + 2 * *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + 2 * m21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + 2 * *ldA * n21; + float *const A_BRbl = A_BR + 2 * m21; + float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + 
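+    // (Added note: the loop above re-applies the interchanges from the left
+    //  factorization, with Workl standing in for the rows that fall outside
+    //  the band of A_L, so the following updates can use the panel uniformly;
+    //  the matching loop below undoes these swaps once the trailing panels
+    //  have been updated.)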
+
+    // apply pivots to A_Rl
+    LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
+
+    // apply pivots to A_Rr columnwise
+    for (j = 0; j < n22; j++) {
+        float *const A_Rrj = A_Rr + 2 * *ldA * j;
+        for (i = j; i < mn1; i++) {
+            const int ip = ipiv_T[i] - 1;
+            if (ip != i) {
+                const float tmpr = A_Rrj[2 * i];
+                const float tmpc = A_Rrj[2 * i + 1];
+                A_Rrj[2 * i]      = A_Rrj[2 * ip];
+                A_Rrj[2 * i + 1]  = A_Rrj[2 * ip + 1];
+                A_Rrj[2 * ip]     = tmpr;
+                A_Rrj[2 * ip + 1] = tmpc;
+            }
+        }
+    }
+
+    // A_TRl = A_TL \ A_TRl
+    BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+    // Worku = A_TRr
+    LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
+    // Worku = A_TL \ Worku
+    BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
+    // A_TRr = Worku
+    LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
+    // A_BRtl = A_BRtl - A_BLt * A_TRl
+    BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
+    // A_BRbl = A_BRbl - Workl * A_TRl
+    BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
+    // A_BRtr = A_BRtr - A_BLt * Worku
+    BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
+    // A_BRbr = A_BRbr - Workl * Worku
+    BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
+
+    // partially undo swaps in A_L
+    for (i = mn1 - 1; i >= 0; i--) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
+            else
+                BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
+        }
+    }
+
+    // recursion(Ab_BR, ipiv_B)
+    RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    if (*info)
+        *info += n1;
+    // shift pivots
+    for (i = 0; i < mn2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/cgemmt.c b/relapack/src/cgemmt.c
new file mode 100644
index 000000000..28e2b00b0
--- /dev/null
+++ b/relapack/src/cgemmt.c
@@ -0,0 +1,167 @@
+#include "relapack.h"
+
+static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
+    const int *, const int *, const float *, const float *, const int *,
+    const float *, const int *, const float *, float *, const int *);
+
+static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
+    const int *, const int *, const float *, const float *, const int *,
+    const float *, const int *, const float *, float *, const int *);
+
+
+/** CGEMMT computes a matrix-matrix product with general matrices but updates
+ * only the upper or lower triangular part of the result matrix.
+ *
+ * This routine performs the same operation as the BLAS routine
+ * cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
+ * but only updates the triangular part of C specified by uplo:
+ * If (*uplo == 'L'), only the lower triangular part of C is updated,
+ * otherwise the upper triangular part is updated.
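+ * (When HAVE_XGEMMT is set to 1 in config.h, the call is forwarded directly
+ * to the BLAS implementation's cgemmt; otherwise the recursive kernel below
+ * is used.)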
+ * */ +void RELAPACK_cgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int ctransA = LAPACK(lsame)(transA, "C"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + const int ctransB = LAPACK(lsame)(transB, "C"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !ctransA && !notransA) + info = 2; + else if (!tranB && !ctransB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("CGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); + const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); + + // Recursive kernel + RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** cgemmt's recursive compute kernel */ +static void RELAPACK_cgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { + // Unblocked + RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const float *const A_T = A; + const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const float *const B_L = B; + const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + float *const C_TL = C; + float *const C_TR = C + 2 * *ldC * n1; + float *const C_BL = C + 2 * n1; + float *const C_BR = C + 2 * *ldC * n1 + 2 * n1; + + // recursion(C_TL) + RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** cgemmt's unblocked compute kernel */ +static void RELAPACK_cgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 
1 : *ldB;
+    const int incC = 1;
+
+    int i;
+    for (i = 0; i < *n; i++) {
+        // A_0
+        // A_i
+        const float *const A_0 = A;
+        const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
+
+        // * B_i *
+        const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
+
+        // * C_0i *
+        // * C_ii *
+        float *const C_0i = C + 2 * *ldC * i;
+        float *const C_ii = C + 2 * *ldC * i + 2 * i;
+
+        if (*uplo == 'L') {
+            const int nmi = *n - i;
+            if (*transA == 'N')
+                BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+            else
+                BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+        } else {
+            const int ip1 = i + 1;
+            if (*transA == 'N')
+                BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+            else
+                BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+        }
+    }
+}
diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c
new file mode 100644
index 000000000..b31a711d0
--- /dev/null
+++ b/relapack/src/cgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
+    const int *, int *, int *);
+
+
+/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's cgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
+ */
+void RELAPACK_cgetrf(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const float ONE[] = { 1., 0. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const float *const A_L = A;
+        float *const A_R = A + 2 * *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** cgetrf's recursive compute kernel */
+static void RELAPACK_cgetrf_rec(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
+        // Unblocked
+        LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0.
}; + const int iONE[] = { 1 }; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // recursion(A_L, ipiv_T) + RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + // apply pivots to A_R + LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); + + // A_TR = A_TL \ A_TR + BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_BL * A_TR + BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); + + // recursion(A_BR, ipiv_B) + RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); + if (*info) + *info += n1; + // apply pivots to A_BL + LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); + // shift pivots + int i; + for (i = 0; i < n2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c new file mode 100644 index 000000000..dff875017 --- /dev/null +++ b/relapack/src/chegst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_chegst_rec(const int *, const char *, const int *, + float *, const int *, const float *, const int *, + float *, const int *, int *); + + +/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's chegst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html + * */ +void RELAPACK_chegst( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CHEGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Allocate work space + float *Work = NULL; + int lWork = 0; +#if XSYGST_ALLOW_MALLOC + const int n1 = CREC_SPLIT(*n); + lWork = n1 * (*n - n1); + Work = malloc(lWork * 2 * sizeof(float)); + if (!Work) + lWork = 0; +#endif + + // recursive kernel + RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); + + // Free work space +#if XSYGST_ALLOW_MALLOC + if (Work) + free(Work); +#endif +} + + +/** chegst's recursive compute kernel */ +static void RELAPACK_chegst_rec( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + float *Work, const int *lWork, int *info +) { + + if (*n <= MAX(CROSSOVER_CHEGST, 1)) { + // Unblocked + LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info); + return; + } + + // Constants + const float ZERO[] = { 0., 0. }; + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const float HALF[] = { .5, 0. }; + const float MHALF[] = { -.5, 0. 
};
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = CREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + 2 * *ldA * n1;
+    float *const A_BL = A + 2 * n1;
+    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const float *const B_TL = B;
+    const float *const B_TR = B + 2 * *ldB * n1;
+    const float *const B_BL = B + 2 * n1;
+    const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR' * A_BL
+            BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR'
+            BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/chetrf.c b/relapack/src/chetrf.c
new file mode 100644
index 000000000..2928235e4
--- /dev/null
+++ b/relapack/src/chetrf.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *,
+    float *, const int *, int *, float *, const int *, int *);
+
+
+/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's chetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
+ * */
+void RELAPACK_chetrf(
+    const char *uplo, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *lWork, int *info
+) {
+
+    // Required work size
+    const int cleanlWork = *n * (*n / 2);
+    int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+    minlWork = 1;
+#endif
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minlWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Work size query
+        *Work = cleanlWork;
+        return;
+    }
+
+    // Ensure Work size
+    float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < cleanlWork) {
+        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
+        if (!cleanWork)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CHETRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Dummy argument
+    int nout;
+
+    // Recursive kernel
+    RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (cleanWork != Work)
+        free(cleanWork);
+#endif
+}
+
+
+/** chetrf's recursive compute kernel */
+static void RELAPACK_chetrf_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *ldWork, int *info
+) {
+
+    // top recursion level?
+ const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? 
n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/chetrf_rec2.c b/relapack/src/chetrf_rec2.c
new file mode 100644
index 000000000..b5c8341b6
--- /dev/null
+++ b/relapack/src/chetrf_rec2.c
@@ -0,0 +1,520 @@
+/* -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+   on Microsoft Windows system, link with libf2c.lib;
+   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+   or, if you install libf2c.a in a standard place, with -lf2c -lm
+   -- in that order, at the end of the command line, as in
+      cc *.o -lf2c -lm
+   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+      http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clahef.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
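+ * (Added note: the number of columns actually factorized is returned in *kb;
+ * as in clahef, this can fall one short of *nb when a 2-by-2 pivot would
+ * cross the panel boundary, which the callers in chetrf.c compensate for.)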
+ * */ +/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * + nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, + int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + float r__1, r__2, r__3, r__4; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + double sqrt(double), r_imag(complex *); + void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); + + /* Local variables */ + static int j, k; + static float t, r1; + static complex d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * + , complex *, int *, complex *, int *, complex *, complex * + , int *, ftnlen), ccopy_(int *, complex *, int *, + complex *, int *), cswap_(int *, complex *, int *, + complex *, int *); + static int kstep; + static float absakk; + extern /* Subroutine */ int clacgv_(int *, complex *, int *); + extern int icamax_(int *, complex *, int *); + extern /* Subroutine */ int csscal_(int *, float *, complex *, int + *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + i__1 = k - 1; + ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k + kw * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + i__1 = k + kw * w_dim1; + i__2 = k + kw * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k + kw * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = imax + kw * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + kw * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - 1; + ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k - imax; + jmax = imax + 
icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (kw - 1) * w_dim1]), dabs(r__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + r1 = 1.f / a[i__1].r; + i__1 = k - 1; + csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + kw * w_dim1], &q__2); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + q__2.r = t, q__2.i = 0.f; + c_div(&q__1, &q__2, &d21); + d21.r = q__1.r, d21.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + r_cnjg(&q__2, &d21); + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = + q__2.r * q__3.i + q__2.i * q__3.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); 
+ i__1 = k - 2; + clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (imax < *n) { + i__1 = *n - imax; + ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (k + 1) * w_dim1]), dabs(r__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk 
* a_dim1], &c__1, &a[kp + (kk +
+ 1) * a_dim1], lda);
+ i__1 = kp - kk - 1;
+ clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
+ if (kp < *n) {
+ i__1 = *n - kp;
+ ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 +
+ kp * a_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = k - 1;
+ cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ }
+ cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ i__1 = k + k * a_dim1;
+ r1 = 1.f / a[i__1].r;
+ i__1 = *n - k;
+ csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ i__1 = *n - k;
+ clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ }
+ } else {
+ if (k < *n - 1) {
+ i__1 = k + 1 + k * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+ d11.r = q__1.r, d11.i = q__1.i;
+ r_cnjg(&q__2, &d21);
+ c_div(&q__1, &w[k + k * w_dim1], &q__2);
+ d22.r = q__1.r, d22.i = q__1.i;
+ q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
+ d22.i + d11.i * d22.r;
+ t = 1.f / (q__1.r - 1.f);
+ q__2.r = t, q__2.i = 0.f;
+ c_div(&q__1, &q__2, &d21);
+ d21.r = q__1.r, d21.i = q__1.i;
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ r_cnjg(&q__2, &d21);
+ i__3 = j + k * w_dim1;
+ q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + (k + 1) * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
+ q__2.r * q__3.i + q__2.i * q__3.r;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = j + (k + 1) * a_dim1;
+ i__3 = j + (k + 1) * w_dim1;
+ q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + k * w_dim1;
+ q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
+ .i;
+ q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
+ d21.r * q__2.i + d21.i * q__2.r;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+ }
+ }
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + k * a_dim1;
+ i__2 = k + 1 + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + (k + 1) * a_dim1;
+ i__2 = k + 1 + (k + 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = *n - k;
+ clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = *n - k - 1;
+ clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -kp;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+ j = k - 1;
+L120:
+ jj = j;
+ jp = ipiv[j];
+ if (jp < 0) {
+ jp = -jp;
+ --j;
+ }
+ --j;
+ if (jp != jj && j >= 1) {
+ cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j >= 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c
new file mode 100644
index 000000000..086393d57
--- /dev/null
+++ b/relapack/src/chetrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's chetrf_rook.
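+ * As in the other _rec2-based routines in this patch, the blocked
+ * level-3 updates of LAPACK's blocked algorithm are performed by the
+ * recursive kernel below; the unblocked kernel runs only at the
+ * recursion leaves.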
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html + * */ +void RELAPACK_chetrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** chetrf_rook's recursive compute kernel */ +static void RELAPACK_chetrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/chetrf_rook_rec2.c b/relapack/src/chetrf_rook_rec2.c new file mode 100644 index 000000000..a42cbfd44 --- /dev/null +++ b/relapack/src/chetrf_rook_rec2.c @@ -0,0 +1,661 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clahef_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
+ int *nb, int *kb, complex *a, int *lda, int *ipiv,
+ complex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ float r__1, r__2;
+ complex q__1, q__2, q__3, q__4, q__5;
+
+ /* Builtin functions */
+ double sqrt(double), r_imag(complex *);
+ void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
+
+ /* Local variables */
+ static int j, k, p;
+ static float t, r1;
+ static complex d11, d21, d22;
+ static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+ static logical done;
+ static int imax, jmax;
+ static float alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+ , complex *, int *, complex *, int *, complex *, complex *
+ , int *, ftnlen);
+ static float sfmin;
+ extern /* Subroutine */ int ccopy_(int *, complex *, int *,
+ complex *, int *);
+ static int itemp;
+ extern /* Subroutine */ int cswap_(int *, complex *, int *,
+ complex *, int *);
+ static int kstep;
+ static float stemp, absakk;
+ extern /* Subroutine */ int clacgv_(int *, complex *, int *);
+ extern int icamax_(int *, complex *, int *);
+ extern double slamch_(char *, ftnlen);
+ extern /* Subroutine */ int csscal_(int *, float *, complex *, int
+ *);
+ static float colmax, rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.f) + 1.f) / 8.f;
+ sfmin = slamch_("S", (ftnlen)1);
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ kstep = 1;
+ p = k;
+ if (k > 1) {
+ i__1 = k - 1;
+ ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
+ c__1);
+ }
+ i__1 = k + kw * w_dim1;
+ i__2 = k + k * a_dim1;
+ r__1 = a[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ i__1 = k + kw * w_dim1;
+ i__2 = k + kw * w_dim1;
+ r__1 = w[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ }
+ i__1 = k + kw * w_dim1;
+ absakk = (r__1 = w[i__1].r, dabs(r__1));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (r__1 =
w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + kw * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + r__1 = w[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + if (k > 1) { + i__1 = k - 1; + ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], + &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + if (imax > 1) { + i__1 = imax - 1; + ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + } + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! 
done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = k - 1 - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + i__1 = k - 1 - p; + clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); + if (p > 1) { + i__1 = p - 1; + ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (dabs(t) >= sfmin) { + r1 = 1.f / t; + i__1 = k - 1; + csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L14: */ + } + } + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + kw * w_dim1], &q__2); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d21); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + r_cnjg(&q__5, &d21); + c_div(&q__2, &q__3, &q__5); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + clacgv_(&i__1, &w[(kw - 1) * 
w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + ++jj; + if (kstep == 2 && jp1 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * + a_dim1], &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (imax < *n) { + i__1 = *n - imax; + ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (k + 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (k + 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! 
((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = p - k - 1; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * + a_dim1], lda); + i__1 = p - k - 1; + clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); + if (p < *n) { + i__1 = *n - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p + * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + } + cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (dabs(t) >= sfmin) { + r1 = 1.f / t; + i__1 = *n - k; + csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L74: */ + } + } + i__1 = *n - k; + clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + k * w_dim1], &q__2); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + r_cnjg(&q__5, &d21); + c_div(&q__2, &q__3, &q__5); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d21); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = 
w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = *n - k; + clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = *n - k - 1; + clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + --jj; + if (kstep == 2 && jp1 != jj && j >= 1) { + cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c new file mode 100644 index 000000000..36d6297cf --- /dev/null +++ b/relapack/src/clauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_clauum_rec(const char *, const int *, float *, + const int *, int *); + + +/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's clauum. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html + * */ +void RELAPACK_clauum( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** clauum's recursive compute kernel */ +static void RELAPACK_clauum_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { + // Unblocked + LAPACK(clauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. 
}; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c new file mode 100644 index 000000000..e0ea7b944 --- /dev/null +++ b/relapack/src/cpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, + float *, const int *, float *, const int *, int *); + + +/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's cpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html + * */ +void RELAPACK_cpbtrf( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const float ZERO[] = { 0., 0. }; + + // Allocate work space + const int n1 = CREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + float *Work = malloc(mWork * nWork * 2 * sizeof(float)); + LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** cpbtrf's recursive compute kernel */ +static void RELAPACK_cpbtrf_rec( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + float *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { + // Unblocked + LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * ((*uplo == 'L') ? 
0 : *kd); + + // Splitting + const int n1 = MIN(CREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + float *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + 2 * *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + 2 * n21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + 2 * *ldA * n21; + float *const A_BRbl = A_BR + 2 * n21; + float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c new file mode 100644 index 000000000..e35caa7fa --- /dev/null +++ b/relapack/src/cpotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static void RELAPACK_cpotrf_rec(const char *, const int *, float *, + const int *, int *); + + +/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's cpotrf. 
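+ * The recursive kernel below splits A into two roughly equal halves
+ * (CREC_SPLIT), factors A_TL, forms the off-diagonal panel with one
+ * ctrsm, updates the trailing block with one cherk, and recurses on
+ * A_BR.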
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html
+ * */
+void RELAPACK_cpotrf(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CPOTRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Recursive kernel
+ RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** cpotrf's recursive compute kernel */
+static void RELAPACK_cpotrf_rec(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+){
+
+ if (*n <= MAX(CROSSOVER_CPOTRF, 1)) {
+ // Unblocked
+ LAPACK(cpotf2)(uplo, n, A, ldA, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+
+ // Splitting
+ const int n1 = CREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ float *const A_TL = A;
+ float *const A_TR = A + 2 * *ldA * n1;
+ float *const A_BL = A + 2 * n1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+ // recursion(A_TL)
+ RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info);
+ if (*info)
+ return;
+
+ if (*uplo == 'L') {
+ // A_BL = A_BL / A_TL'
+ BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
+ // A_BR = A_BR - A_BL * A_BL'
+ BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
+ } else {
+ // A_TR = A_TL' \ A_TR
+ BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+ // A_BR = A_BR - A_TR' * A_TR
+ BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
+ }
+
+ // recursion(A_BR)
+ RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info);
+ if (*info)
+ *info += n1;
+}
diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c
new file mode 100644
index 000000000..01c161d1a
--- /dev/null
+++ b/relapack/src/csytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's csytrf.
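+ * The Work buffer must hold n*(n/2) complex entries; lWork == -1
+ * requests a workspace-size query, and when XSYTRF_ALLOW_MALLOC is
+ * defined a smaller lWork is accepted and a large enough buffer is
+ * allocated internally with malloc.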
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html + * */ +void RELAPACK_csytrf( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** csytrf's recursive compute kernel */ +static void RELAPACK_csytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/csytrf_rec2.c b/relapack/src/csytrf_rec2.c new file mode 100644 index 000000000..9d6bd849d --- /dev/null +++ b/relapack/src/csytrf_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
+ nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
+ int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ float r__1, r__2, r__3, r__4;
+ complex q__1, q__2, q__3;
+
+ /* Builtin functions */
+ double sqrt(double), r_imag(complex *);
+ void c_div(complex *, complex *, complex *);
+
+ /* Local variables */
+ static int j, k;
+ static complex t, r1, d11, d21, d22;
+ static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+ static float alpha;
+ extern /* Subroutine */ int cscal_(int *, complex *, complex *,
+ int *);
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+ , complex *, int *, complex *, int *, complex *, complex *
+ , int *, ftnlen), ccopy_(int *, complex *, int *,
+ complex *, int *), cswap_(int *, complex *, int *,
+ complex *, int *);
+ static int kstep;
+ static float absakk;
+ extern int icamax_(int *, complex *, int *);
+ static float colmax, rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.f) + 1.f) / 8.f;
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ }
+ kstep = 1;
+ i__1 = k + kw * w_dim1;
+ absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
+ w_dim1]), dabs(r__2));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ + kw * w_dim1]), dabs(r__2));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ } else {
+ if (absakk >= alpha * colmax) {
+ kp = k;
+ } else {
+ ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ i__1 = k - imax;
+ ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
+ a_dim1 + 1], lda,
&w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (kw - 1) * w_dim1]), dabs(r__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha * + rowmax) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = k - 1; + cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + kw * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * + d22.i + d11.i * d22.r; + q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; + c_div(&q__1, &c_b1, &q__2); + t.r = q__1.r, t.i = q__1.i; + c_div(&q__1, &t, &d21); + d21.r = q__1.r, d21.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } 
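+/* Note: the pivot bookkeeping below follows LAPACK's convention: a
+   positive ipiv[k] records a 1x1 pivot interchanged with row/column kp,
+   while a 2x2 pivot stores -kp in both ipiv[k] and ipiv[k-1]. */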
+ } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * + w_dim1]), dabs(r__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (k + 1) * w_dim1]), dabs(r__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha * + rowmax) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = *n - k; + cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = 
k + 1 + k * w_dim1;
+		d21.r = w[i__1].r, d21.i = w[i__1].i;
+		c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		d11.r = q__1.r, d11.i = q__1.i;
+		c_div(&q__1, &w[k + k * w_dim1], &d21);
+		d22.r = q__1.r, d22.i = q__1.i;
+		q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
+			d22.i + d11.i * d22.r;
+		q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
+		c_div(&q__1, &c_b1, &q__2);
+		t.r = q__1.r, t.i = q__1.i;
+		c_div(&q__1, &t, &d21);
+		d21.r = q__1.r, d21.i = q__1.i;
+		i__1 = *n;
+		for (j = k + 2; j <= i__1; ++j) {
+		    i__2 = j + k * a_dim1;
+		    i__3 = j + k * w_dim1;
+		    q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+			    q__3.i = d11.r * w[i__3].i + d11.i * w[i__3].r;
+		    i__4 = j + (k + 1) * w_dim1;
+		    q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4].i;
+		    q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
+			    d21.r * q__2.i + d21.i * q__2.r;
+		    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+		    i__2 = j + (k + 1) * a_dim1;
+		    i__3 = j + (k + 1) * w_dim1;
+		    q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+			    q__3.i = d22.r * w[i__3].i + d22.i * w[i__3].r;
+		    i__4 = j + k * w_dim1;
+		    q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4].i;
+		    q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
+			    d21.r * q__2.i + d21.i * q__2.r;
+		    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+		}
+	    }
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + k * a_dim1;
+	    i__2 = k + 1 + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + (k + 1) * a_dim1;
+	    i__2 = k + 1 + (k + 1) * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	}
+    }
+    if (kstep == 1) {
+	ipiv[k] = kp;
+    } else {
+	ipiv[k] = -kp;
+	ipiv[k + 1] = -kp;
+    }
+    k += kstep;
+    goto L70;
+L90:
+    j = k - 1;
+L120:
+    jj = j;
+    jp = ipiv[j];
+    if (jp < 0) {
+	jp = -jp;
+	--j;
+    }
+    --j;
+    if (jp != jj && j >= 1) {
+	cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    if (j > 1) {
+	goto L120;
+    }
+    *kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c
new file mode 100644
index 000000000..aa7dd0e57
--- /dev/null
+++ b/relapack/src/csytrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *,
+    float *, const int *, int *, float *, const int *, int *);
+
+
+/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's csytrf_rook.
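+ * A workspace query is supported: if *lWork == -1, the routine only
+ * stores the required size of Work in Work[0] and returns; if
+ * XSYTRF_ALLOW_MALLOC is set, a too small Work buffer is replaced by a
+ * temporary allocation instead. A minimal query sketch (illustrative
+ * only; n, A, ldA and ipiv stand for the caller's data):
+ *
+ *     int lWork = -1, info;
+ *     float query;
+ *     RELAPACK_csytrf_rook("L", &n, A, &ldA, ipiv, &query, &lWork, &info);
+ *     lWork = (int)query;  // allocate Work, then call again to factorize
+ *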
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html + * */ +void RELAPACK_csytrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** csytrf_rook's recursive compute kernel */ +static void RELAPACK_csytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/csytrf_rook_rec2.c b/relapack/src/csytrf_rook_rec2.c new file mode 100644 index 000000000..6638338a6 --- /dev/null +++ b/relapack/src/csytrf_rook_rec2.c @@ -0,0 +1,565 @@ +/* -- translated by f2c (version 20100827). 
+	You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
+	int *nb, int *kb, complex *a, int *lda, int *ipiv,
+	complex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    float r__1, r__2;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions */
+    double sqrt(double), r_imag(complex *);
+    void c_div(complex *, complex *, complex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static complex t, r1, d11, d12, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static float alpha;
+    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
+	    int *);
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+	    , complex *, int *, complex *, int *, complex *, complex *
+	    , int *, ftnlen);
+    static float sfmin;
+    extern /* Subroutine */ int ccopy_(int *, complex *, int *,
+	    complex *, int *);
+    static int itemp;
+    extern /* Subroutine */ int cswap_(int *, complex *, int *,
+	    complex *, int *);
+    static int kstep;
+    static float stemp, absakk;
+    extern int icamax_(int *, complex *, int *);
+    extern double slamch_(char *, ftnlen);
+    static float colmax, rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.f) + 1.f) / 8.f;
+    sfmin = slamch_("S", (ftnlen)1);
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	kstep = 1;
+	p = k;
+	ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    q__1.r = -1.f, q__1.i = -0.f;
+	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+		    lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	i__1 = k + kw * w_dim1;
+	absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
+		w_dim1]), dabs(r__2));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+		    + kw * w_dim1]), dabs(r__2));
+	} else {
+	    colmax = 0.f;
+	}
+	if (dmax(absakk,colmax) == 0.f) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	    ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	} else {
+	    if (!
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha * + rowmax)) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = k - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k + + k * a_dim1]), dabs(r__2)) >= sfmin) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = k - 1; + cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0.f || a[i__1].i != 0.f) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L14: */ + } + } + } + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d12.r = w[i__1].r, d12.i = w[i__1].i; + c_div(&q__1, &w[k + kw * w_dim1], &d12); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, 
q__3.i = d11.r * + d22.i + d11.i * d22.r; + q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; + c_div(&q__1, &c_b1, &q__2); + t.r = q__1.r, t.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d12); + q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * + q__2.i + t.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d12); + q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * + q__2.i + t.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * + w_dim1]), dabs(r__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (k + 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (k + 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha * + rowmax)) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - k - 1; + ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k + + k * a_dim1]), dabs(r__2)) >= sfmin) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = *n - k; + cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0.f || a[i__1].i != 0.f) { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L74: */ + } + } + } + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k + k * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * + d22.i + d11.i * d22.r; 
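+/* The 2-by-2 pivot D = [[W(k,k), W(k+1,k)], [W(k+1,k), W(k+1,k+1)]] is
+   inverted in scaled form: with d21 = W(k+1,k), d11 = W(k+1,k+1)/d21,
+   d22 = W(k,k)/d21, and t = 1/(d11*d22 - 1) (computed next), the
+   inverse is (t/d21) * [[d11, -1], [-1, d22]], which the loop below
+   applies to rows j > k+1 of columns k and k+1. */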
+		q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
+		c_div(&q__1, &c_b1, &q__2);
+		t.r = q__1.r, t.i = q__1.i;
+		i__1 = *n;
+		for (j = k + 2; j <= i__1; ++j) {
+		    i__2 = j + k * a_dim1;
+		    i__3 = j + k * w_dim1;
+		    q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+			    q__4.i = d11.r * w[i__3].i + d11.i * w[i__3].r;
+		    i__4 = j + (k + 1) * w_dim1;
+		    q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4].i;
+		    c_div(&q__2, &q__3, &d21);
+		    q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
+			    q__2.i + t.i * q__2.r;
+		    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+		    i__2 = j + (k + 1) * a_dim1;
+		    i__3 = j + (k + 1) * w_dim1;
+		    q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+			    q__4.i = d22.r * w[i__3].i + d22.i * w[i__3].r;
+		    i__4 = j + k * w_dim1;
+		    q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4].i;
+		    c_div(&q__2, &q__3, &d21);
+		    q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
+			    q__2.i + t.i * q__2.r;
+		    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+		}
+	    }
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + k * a_dim1;
+	    i__2 = k + 1 + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + (k + 1) * a_dim1;
+	    i__2 = k + 1 + (k + 1) * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	}
+    }
+    if (kstep == 1) {
+	ipiv[k] = kp;
+    } else {
+	ipiv[k] = -p;
+	ipiv[k + 1] = -kp;
+    }
+    k += kstep;
+    goto L70;
+L90:
+    j = k - 1;
+L120:
+    kstep = 1;
+    jp1 = 1;
+    jj = j;
+    jp2 = ipiv[j];
+    if (jp2 < 0) {
+	jp2 = -jp2;
+	--j;
+	jp1 = -ipiv[j];
+	kstep = 2;
+    }
+    --j;
+    if (jp2 != jj && j >= 1) {
+	cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    jj = j + 1;
+    if (jp1 != jj && kstep == 2) {
+	cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    if (j >= 1) {
+	goto L120;
+    }
+    *kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c
new file mode 100644
index 000000000..15c738baf
--- /dev/null
+++ b/relapack/src/ctgsyl.c
@@ -0,0 +1,268 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *,
+    const int *, const float *, const int *, const float *, const int *,
+    float *, const int *, const float *, const int *, const float *,
+    const int *, float *, const int *, float *, float *, float *, int *);
+
+
+/** CTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's ctgsyl.
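+ * For trans = 'N' it solves the pair of equations
+ *
+ *     A * R - L * B = scale * C
+ *     D * R - L * E = scale * F
+ *
+ * for R and L, which overwrite C and F; for trans = 'C' the
+ * conjugate-transposed dual system is solved (see the LAPACK
+ * documentation linked below for the exact form).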
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
+ * */
+void RELAPACK_ctgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dif,
+    float *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "C");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CTGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'C';
+
+    // Constant
+    const float ZERO[] = { 0., 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    float scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        float dscale = 0;
+        float dsum = 1;
+        RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
+                LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** ctgsyl's recursive compute kernel */
+static void RELAPACK_ctgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dsum, float *dscale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
+        // Unblocked
+        LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1., 0. };
+    float scale2[] = { 1., 0.
}; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + const int m1 = CREC_SPLIT(*m); + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const float *const A_TL = A; + const float *const A_TR = A + 2 * *ldA * m1; + const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; + + // C_T + // C_B + float *const C_T = C; + float *const C_B = C + 2 * m1; + + // D_TL D_TR + // 0 D_BR + const float *const D_TL = D; + const float *const D_TR = D + 2 * *ldD * m1; + const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1; + + // F_T + // F_B + float *const F_T = F; + float *const F_B = F + 2 * m1; + + if (*trans == 'N') { + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); + // C_T = C_T - A_TR * C_B + BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // F_T = F_T - D_TR * C_B + BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); + } + } else { + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); + // C_B = C_B - A_TR^H * C_T + BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // C_B = C_B - D_TR^H * F_T + BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); + } + } + } else { + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const float *const B_TL = B; + const float *const B_TR = B + 2 * *ldB * n1; + const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // C_L C_R + float *const C_L = C; + float *const C_R = C + 2 * *ldC * n1; + + // E_TL E_TR + // 0 E_BR + const float *const E_TL = E; + const float *const E_TR = E + 2 * *ldE * n1; + const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1; + + // F_L F_R + float *const F_L = F; + float *const F_R = F + 2 * *ldF * n1; + + if (*trans == 'N') { + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); + // C_R = C_R + F_L * B_TR + BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); + // F_R = F_R + F_L * E_TR + BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, 
&n1, C_L, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); + } + } else { + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); + // F_L = F_L + C_R * B_TR + BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); + // F_L = F_L + F_R * E_TR + BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); + } + } + } + + *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c new file mode 100644 index 000000000..b548d5354 --- /dev/null +++ b/relapack/src/ctrsyl.c @@ -0,0 +1,163 @@ +#include "relapack.h" + +static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, int *); + + +/** CTRSYL solves the complex Sylvester matrix equation. + * + * This routine is functionally equivalent to LAPACK's ctrsyl. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html + * */ +void RELAPACK_ctrsyl( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + + // Check arguments + const int notransA = LAPACK(lsame)(tranA, "N"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int ctransB = LAPACK(lsame)(tranB, "C"); + *info = 0; + if (!ctransA && !notransA) + *info = -1; + else if (!ctransB && !notransB) + *info = -2; + else if (*isgn != 1 && *isgn != -1) + *info = -3; + else if (*m < 0) + *info = -4; + else if (*n < 0) + *info = -5; + else if (*ldA < MAX(1, *m)) + *info = -7; + else if (*ldB < MAX(1, *n)) + *info = -9; + else if (*ldC < MAX(1, *m)) + *info = -11; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CTRSYL", &minfo); + return; + } + + // Clean char * arguments + const char cleantranA = notransA ? 'N' : 'C'; + const char cleantranB = notransB ? 'N' : 'C'; + + // Recursive kernel + RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} + + +/** ctrsyl's recursive compute kernel */ +static void RELAPACK_ctrsyl_rec( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + + if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { + // Unblocked + RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const float MSGN[] = { -*isgn, 0. }; + const int iONE[] = { 1 }; + + // Outputs + float scale1[] = { 1., 0. 
};
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        const int m1 = CREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const float *const A_TL = A;
+        const float *const A_TR = A + 2 * *ldA * m1;
+        const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        float *const C_T = C;
+        float *const C_B = C + 2 * m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        const int n1 = CREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const float *const B_TL = B;
+        const float *const B_TR = B + 2 * *ldB * n1;
+        const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        float *const C_L = C;
+        float *const C_R = C + 2 * *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c
new file mode 100644
index 000000000..518574868
--- /dev/null
+++ b/relapack/src/ctrsyl_rec2.c
@@ -0,0 +1,392 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "../config.h" +#include "f2c.h" + +#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { + extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); + complex result; + cdotu_(&result, n, x, incx, y, incy); + return result; +} +#define cdotu_ cdotu_fun + +complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { + extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); + complex result; + cdotc_(&result, n, x, incx, y, incy); + return result; +} +#define cdotc_ cdotc_fun +#endif + +#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +complex cladiv_fun(complex *a, complex *b) { + extern void cladiv_(complex *, complex *, complex *); + complex result; + cladiv_(&result, a, b); + return result; +} +#define cladiv_ cladiv_fun +#endif + +/* Table of constant values */ + +static int c__1 = 1; + +/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) + * + * This routine is an exact copy of LAPACK's ctrsyl. + * It serves as an unblocked kernel in the recursive algorithms. + * */ +/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int + *isgn, int *m, int *n, complex *a, int *lda, complex *b, + int *ldb, complex *c__, int *ldc, float *scale, int *info, + ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + float r__1, r__2; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + float r_imag(complex *); + void r_cnjg(complex *, complex *); + + /* Local variables */ + static int j, k, l; + static complex a11; + static float db; + static complex x11; + static float da11; + static complex vec; + static float dum[1], eps, sgn, smin; + static complex suml, sumr; + /* Complex */ complex cdotc_(int *, complex *, int + *, complex *, int *); + extern int lsame_(char *, char *, ftnlen, ftnlen); + /* Complex */ complex cdotu_(int *, complex *, int + *, complex *, int *); + extern /* Subroutine */ int slabad_(float *, float *); + extern float clange_(char *, int *, int *, complex *, + int *, float *, ftnlen); + /* Complex */ complex cladiv_(complex *, complex *); + static float scaloc; + extern float slamch_(char *, ftnlen); + extern /* Subroutine */ int csscal_(int *, float *, complex *, int + *), xerbla_(char *, int *, ftnlen); + static float bignum; + static int notrna, notrnb; + static float smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CTRSY2", &i__1, (ftnlen)6); + return; + } + *scale = 1.f; + if (*m == 0 || *n == 0) { + return; + } + eps = slamch_("P", (ftnlen)1); + smlnum = slamch_("S", (ftnlen)1); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + smlnum = smlnum * (float) (*m * *n) / eps; + bignum = 1.f / smlnum; +/* Computing MAX */ + r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = dmax(r__1,r__2); + sgn = (float) (*isgn); + if (notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + for (k = *m; k >= 1; --k) { + i__2 = *m - k; +/* Computing MIN */ + i__3 = k + 1; +/* Computing MIN */ + i__4 = k + 1; + q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ + min(i__4,*m) + l * c_dim1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__2 = l - 1; + q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = q__1.r, sumr.i = q__1.i; + i__2 = k + l * c_dim1; + q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i; + q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L20: */ + } +/* L30: */ + } + } else if (! 
notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + i__3 = k - 1; + q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__3 = l - 1; + q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = q__1.r, sumr.i = q__1.i; + i__3 = k + l * c_dim1; + q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + r_cnjg(&q__2, &a[k + k * a_dim1]); + i__3 = l + l * b_dim1; + q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; + q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + i__3 = k + l * c_dim1; + c__[i__3].r = x11.r, c__[i__3].i = x11.i; +/* L50: */ + } +/* L60: */ + } + } else if (! notrna && ! notrnb) { + for (l = *n; l >= 1; --l) { + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = k - 1; + q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__2 = *n - l; +/* Computing MIN */ + i__3 = l + 1; +/* Computing MIN */ + i__4 = l + 1; + q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ + l + min(i__4,*n) * b_dim1], ldb); + sumr.r = q__1.r, sumr.i = q__1.i; + i__2 = k + l * c_dim1; + r_cnjg(&q__4, &sumr); + q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; + q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i; + r_cnjg(&q__1, &q__2); + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L80: */ + } +/* L90: */ + } + } else if (notrna && ! 
notrnb) { + for (l = *n; l >= 1; --l) { + for (k = *m; k >= 1; --k) { + i__1 = *m - k; +/* Computing MIN */ + i__2 = k + 1; +/* Computing MIN */ + i__3 = k + 1; + q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[ + min(i__3,*m) + l * c_dim1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__1 = *n - l; +/* Computing MIN */ + i__2 = l + 1; +/* Computing MIN */ + i__3 = l + 1; + q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[ + l + min(i__3,*n) * b_dim1], ldb); + sumr.r = q__1.r, sumr.i = q__1.i; + i__1 = k + l * c_dim1; + r_cnjg(&q__4, &sumr); + q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__1 = k + k * a_dim1; + r_cnjg(&q__3, &b[l + l * b_dim1]); + q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i; + q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + i__1 = k + l * c_dim1; + c__[i__1].r = x11.r, c__[i__1].i = x11.i; +/* L110: */ + } +/* L120: */ + } + } + return; +} diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c new file mode 100644 index 000000000..0262cb59d --- /dev/null +++ b/relapack/src/ctrtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, + float *, const int *, int *); + + +/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. + * + * This routine is functionally equivalent to LAPACK's ctrtri. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html + * */ +void RELAPACK_ctrtri( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (!nounit && !unit) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CTRTRI", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleandiag = nounit ? 
'N' : 'U'; + + // check for singularity + if (nounit) { + int i; + for (i = 0; i < *n; i++) + if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { + *info = i; + return; + } + } + + // Recursive kernel + RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); +} + + +/** ctrtri's recursive compute kernel */ +static void RELAPACK_ctrtri_rec( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { + // Unblocked + LAPACK(ctrti2)(uplo, diag, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = - A_BL * A_TL + BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); + // A_BL = A_BR \ A_BL + BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TR = - A_TL * A_TR + BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); + // A_TR = A_TR / A_BR + BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c new file mode 100644 index 000000000..1a1757d31 --- /dev/null +++ b/relapack/src/dgbtrf.c @@ -0,0 +1,227 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, + const int *, double *, const int *, int *, double *, const int *, double *, + const int *, int *); + + +/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's dgbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html + * */ +void RELAPACK_dgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DGBTRF", &minfo); + return; + } + + // Constant + const double ZERO[] = { 0. }; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + double *const A_j = A + *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[i] = 0.; + } + + // Allocate work space + const int n1 = DREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? 
MAX(0, *n - *kl) : *kl; + double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); + double *Worku = malloc(mWorku * nWorku * sizeof(double)); + LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** dgbtrf's recursive compute kernel */ +static void RELAPACK_dgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { + // Unblocked + LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + kv; + + // Splitting + const int n1 = MIN(DREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + double *const Ab_L = Ab; + double *const Ab_BR = Ab + *ldAb * n1; + + // A_L A_R + double *const A_L = A; + double *const A_R = A + *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + m1; + double *const A_BR = A + *ldA * n1 + m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + double *const A_Rl = A_R; + double *const A_Rr = A_R + *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + m21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + *ldA * n21; + double *const A_BRbl = A_BR + m21; + double *const A_BRbr = A_BR + *ldA * n21 + m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // apply pivots to A_Rl + LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); + + // apply pivots to A_Rr columnwise + for (j = 0; j < n22; j++) { + double *const A_Rrj = A_Rr + *ldA * j; + for (i = j; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + const double tmp = A_Rrj[i]; + A_Rrj[i] = A_Rr[ip]; + A_Rrj[ip] = tmp; + } + } + } + + // A_TRl = A_TL \ A_TRl + BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // Worku = A_TRr + LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); + // Worku = A_TL \ Worku + BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + 
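+    // (The dtrsm above runs on Worku rather than on A_TRr in place:
+    //  A_TRr reaches above the stored band, so only its lower trapezoid
+    //  is physically present in Ab; the block is therefore staged through
+    //  Worku, whose unused triangle was zeroed at allocation, and copied
+    //  back below.)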
// A_TRr = Worku + LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); + // A_BRtl = A_BRtl - A_BLt * A_TRl + BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); + // A_BRbl = A_BRbl - Workl * A_TRl + BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); + // A_BRtr = A_BRtr - A_BLt * Worku + BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Workl * Worku + BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); + + // partially undo swaps in A_L + for (i = mn1 - 1; i >= 0; i--) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // recursion(Ab_BR, ipiv_B) + RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/dgemmt.c b/relapack/src/dgemmt.c new file mode 100644 index 000000000..9c925b586 --- /dev/null +++ b/relapack/src/dgemmt.c @@ -0,0 +1,165 @@ +#include "relapack.h" + +static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + +static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + + +/** DGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_dgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !notransA) + info = 2; + else if (!tranB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("DGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : 'T'; + const char cleantransB = notransB ? 
'N' : 'T'; + + // Recursive kernel + RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** dgemmt's recursive compute kernel */ +static void RELAPACK_dgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { + // Unblocked + RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const double *const A_T = A; + const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const double *const B_L = B; + const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + double *const C_TL = C; + double *const C_TR = C + *ldC * n1; + double *const C_BL = C + n1; + double *const C_BR = C + *ldC * n1 + n1; + + // recursion(C_TL) + RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** dgemmt's unblocked compute kernel */ +static void RELAPACK_dgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; + + int i; + for (i = 0; i < *n; i++) { + // A_0 + // A_i + const double *const A_0 = A; + const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i); + + // * B_i * + const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i); + + // * C_0i * + // * C_ii * + double *const C_0i = C + *ldC * i; + double *const C_ii = C + *ldC * i + i; + + if (*uplo == 'L') { + const int nmi = *n - i; + if (*transA == 'N') + BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + else + BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + } else { + const int ip1 = i + 1; + if (*transA == 'N') + BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + else + BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + } + } +} diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c new file mode 100644 index 000000000..07f5472fd --- /dev/null +++ b/relapack/src/dgetrf.c @@ -0,0 +1,117 @@ +#include "relapack.h" + +static void RELAPACK_dgetrf_rec(const int *, const int *, double *, + const int *, int *, int *); + + +/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's dgetrf. 
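 * 
 * A minimal calling sketch (illustrative values only; A is stored
 * column-major):
 *
 *     int m = 3, n = 3, ldA = 3, ipiv[3], info;
 *     double A[9] = { 4., 2., 1.,   2., 5., 3.,   1., 3., 6. };
 *     RELAPACK_dgetrf(&m, &n, A, &ldA, ipiv, &info);
 *     // on return A holds the L and U factors, ipiv the 1-based pivot
 *     // indices, and info == 0 indicates success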
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
+ * */
+void RELAPACK_dgetrf(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const double ONE[] = { 1. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const double *const A_L = A;
+        double *const A_R = A + *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** dgetrf's recursive compute kernel */
+static void RELAPACK_dgetrf_rec(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
+        // Unblocked
+        LAPACK(dgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR
+    BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR
+    BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/dlauum.c b/relapack/src/dlauum.c
new file mode 100644
index 000000000..d722ea809
--- /dev/null
+++ b/relapack/src/dlauum.c
@@ -0,0 +1,87 @@
+#include "relapack.h"
+
+static void RELAPACK_dlauum_rec(const char *, const int *, double *,
+    const int *, int *);
+
+
+/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
+ *
+ * This routine is functionally equivalent to LAPACK's dlauum.
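+ *
+ * A minimal calling sketch (illustrative values only): compute L**T * L for
+ * a 2-by-2 lower triangular factor stored column-major:
+ *
+ *     int n = 2, ldA = 2, info;
+ *     double A[4] = { 2., 3.,   0., 4. };   // L = [2 0; 3 4]
+ *     RELAPACK_dlauum("L", &n, A, &ldA, &info);
+ *     // the lower triangle of A now holds the lower triangle of L**T * L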
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html + * */ +void RELAPACK_dlauum( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** dlauum's recursive compute kernel */ +static void RELAPACK_dlauum_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { + // Unblocked + LAPACK(dlauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c new file mode 100644 index 000000000..6fd0ebe48 --- /dev/null +++ b/relapack/src/dpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, + double *, const int *, double *, const int *, int *); + + +/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's dpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html + * */ +void RELAPACK_dpbtrf( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const double ZERO[] = { 0. }; + + // Allocate work space + const int n1 = DREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; + double *Work = malloc(mWork * nWork * sizeof(double)); + LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** dpbtrf's recursive compute kernel */ +static void RELAPACK_dpbtrf_rec( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + double *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { + // Unblocked + LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + ((*uplo == 'L') ? 0 : *kd); + + // Splitting + const int n1 = MIN(DREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + double *const Ab_BR = Ab + *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, n1); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + n21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + *ldA * n21; + double *const A_BRbl = A_BR + n21; + double *const A_BRbr = A_BR + *ldA * n21 + n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dpotrf.c b/relapack/src/dpotrf.c new file mode 100644 index 000000000..c14fb3d71 --- /dev/null +++ b/relapack/src/dpotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static 
void RELAPACK_dpotrf_rec(const char *, const int *, double *, + const int *, int *); + + +/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's dpotrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html + * */ +void RELAPACK_dpotrf( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DPOTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info); +} + + +/** dpotrf's recursive compute kernel */ +static void RELAPACK_dpotrf_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { + // Unblocked + LAPACK(dpotf2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = A_BL / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); + // A_BR = A_BR - A_BL * A_BL' + BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); + } else { + // A_TR = A_TL' \ A_TR + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_TR' * A_TR + BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); + } + + // recursion(A_BR) + RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dsygst.c b/relapack/src/dsygst.c new file mode 100644 index 000000000..0228068ce --- /dev/null +++ b/relapack/src/dsygst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_dsygst_rec(const int *, const char *, const int *, + double *, const int *, const double *, const int *, + double *, const int *, int *); + + +/** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's dsygst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html + * */ +void RELAPACK_dsygst( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 
'L' : 'U';
+
+    // Allocate work space
+    double *Work = NULL;
+    int lWork = 0;
+#if XSYGST_ALLOW_MALLOC
+    const int n1 = DREC_SPLIT(*n);
+    lWork = n1 * (*n - n1);
+    Work = malloc(lWork * sizeof(double));
+    if (!Work)
+        lWork = 0;
+#endif
+
+    // recursive kernel
+    RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
+
+    // Free work space
+#if XSYGST_ALLOW_MALLOC
+    if (Work)
+        free(Work);
+#endif
+}
+
+
+/** dsygst's recursive compute kernel */
+static void RELAPACK_dsygst_rec(
+    const int *itype, const char *uplo, const int *n,
+    double *A, const int *ldA, const double *B, const int *ldB,
+    double *Work, const int *lWork, int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_DSYGST, 1)) {
+        // Unblocked
+        LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info);
+        return;
+    }
+
+    // Constants
+    const double ZERO[] = { 0. };
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const double HALF[] = { .5 };
+    const double MHALF[] = { -.5 };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const double *const B_TL = B;
+    const double *const B_TR = B + *ldB * n1;
+    const double *const B_BL = B + n1;
+    const double *const B_BR = B + *ldB * n1 + n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR' * A_BL
+            BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR'
+            BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c
new file mode 100644
index 000000000..80b119336
--- /dev/null
+++ b/relapack/src/dsytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** DSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's dsytrf.
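+ *
+ * A minimal calling sketch (illustrative values only), using the
+ * LAPACK-style workspace query via lWork == -1:
+ *
+ *     int n = 100, ldA = 100, ipiv[100], lWork = -1, info;
+ *     double query;
+ *     RELAPACK_dsytrf("L", &n, A, &ldA, ipiv, &query, &lWork, &info);
+ *     lWork = (int) query;               // n * (n / 2) elements
+ *     double *Work = malloc(lWork * sizeof(double));
+ *     RELAPACK_dsytrf("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *     // (A is an n-by-n column-major symmetric matrix set up elsewhere)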
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html + * */ +void RELAPACK_dsytrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** dsytrf's recursive compute kernel */ +static void RELAPACK_dsytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_DSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = DREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + double *const A_BL_B = A + *n; + double *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + n1; + double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = DREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + *ldA * n_rest; + double *const A_TR_T = A + *ldA * (n_rest + n1); + double *const A_TL = A + *ldA * n_rest + n_rest; + double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/dsytrf_rec2.c b/relapack/src/dsytrf_rec2.c new file mode 100644 index 000000000..72ef827b1 --- /dev/null +++ b/relapack/src/dsytrf_rec2.c @@ -0,0 +1,352 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static double c_b8 = -1.; +static double c_b9 = 1.; + +/** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. + * + * This routine is a minor modification of LAPACK's dlasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int * + nb, int *kb, double *a, int *lda, int *ipiv, + double *w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + double d__1, d__2, d__3; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k; + static double t, r1, d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static double alpha; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int dgemv_(char *, int *, int *, + double *, double *, int *, double *, int *, + double *, double *, int *, ftnlen), dcopy_(int *, + double *, int *, double *, int *), dswap_(int + *, double *, int *, double *, int *); + static int kstep; + static double absakk; + extern int idamax_(int *, double *, int *); + static double colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.) + 1.) / 8.; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + kstep = 1; + absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); + if (imax > 1) { + i__1 = imax - 1; + jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1], + abs(d__1)); + rowmax = max(d__2,d__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >= + alpha * rowmax) { + kp = imax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kk - 1 - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + r1 = 1. / a[k + k * a_dim1]; + i__1 = k - 1; + dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + d21 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d21; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; + t = 1. / (d11 * d22 - 1.); + d21 = t / d21; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) + * w_dim1] - w[j + kw * w_dim1]); + a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1], + abs(d__1)); + rowmax = max(d__2,d__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >= + alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k + kstep - 1; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kp - kk - 1; + dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + r1 = 1. / a[k + k * a_dim1]; + i__1 = *n - k; + dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1. / (d11 * d22 - 1.); + d21 = t / d21; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - + w[j + (k + 1) * w_dim1]); + a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c new file mode 100644 index 000000000..19a875c7a --- /dev/null +++ b/relapack/src/dsytrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include +#endif + +static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); + + +/** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's dsytrf_rook. 
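+ *
+ * The calling sequence is identical to RELAPACK_dsytrf (see dsytrf.c),
+ * including the lWork == -1 workspace query; only the pivoting strategy
+ * differs: rook pivoting performs additional row/column searches to bound
+ * the magnitude of the entries of the computed factor.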
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html + * */ +void RELAPACK_dsytrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** dsytrf_rook's recursive compute kernel */ +static void RELAPACK_dsytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = DREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + double *const A_BL_B = A + *n; + double *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + n1; + double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = DREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + *ldA * n_rest; + double *const A_TR_T = A + *ldA * (n_rest + n1); + double *const A_TL = A + *ldA * n_rest + n_rest; + double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/dsytrf_rook_rec2.c b/relapack/src/dsytrf_rook_rec2.c new file mode 100644 index 000000000..105ef5ed3 --- /dev/null +++ b/relapack/src/dsytrf_rook_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static double c_b9 = -1.; +static double c_b10 = 1.; + +/** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. + * + * This routine is a minor modification of LAPACK's dlasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, double *a, int *lda, int *ipiv, + double *w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + double d__1; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k, p; + static double t, r1, d11, d12, d21, d22; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static logical done; + static int imax, jmax; + static double alpha; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int dgemv_(char *, int *, int *, + double *, double *, int *, double *, int *, + double *, double *, int *, ftnlen); + static double dtemp, sfmin; + static int itemp; + extern /* Subroutine */ int dcopy_(int *, double *, int *, + double *, int *), dswap_(int *, double *, int + *, double *, int *); + static int kstep; + extern double dlamch_(char *, ftnlen); + static double absakk; + extern int idamax_(int *, double *, int *); + static double colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.) + 1.) / 8.; + sfmin = dlamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) < + alpha * rowmax)) { + kp = imax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = k - 1 - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) { + r1 = 1. / a[k + k * a_dim1]; + i__1 = k - 1; + dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else if (a[k + k * a_dim1] != 0.) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L14: */ + } + } + } + } else { + if (k > 2) { + d12 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d12; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; + t = 1. 
/ (d11 * d22 - 1.); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * + w_dim1] - w[j + kw * w_dim1]) / d12); + a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]) / d12); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha + * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! 
done) {
+                goto L72;
+            }
+        }
+        kk = k + kstep - 1;
+        if (kstep == 2 && p != k) {
+            i__1 = p - k;
+            dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
+                    lda);
+            i__1 = *n - p + 1;
+            dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
+                    c__1);
+            dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
+            dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
+        }
+        if (kp != kk) {
+            a[kp + k * a_dim1] = a[kk + k * a_dim1];
+            i__1 = kp - k - 1;
+            dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
+                    * a_dim1], lda);
+            i__1 = *n - kp + 1;
+            dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
+                    a_dim1], &c__1);
+            dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+            dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+        }
+        if (kstep == 1) {
+            i__1 = *n - k + 1;
+            dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+                    c__1);
+            if (k < *n) {
+                if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
+                    r1 = 1. / a[k + k * a_dim1];
+                    i__1 = *n - k;
+                    dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+                } else if (a[k + k * a_dim1] != 0.) {
+                    i__1 = *n;
+                    for (ii = k + 1; ii <= i__1; ++ii) {
+                        a[ii + k * a_dim1] /= a[k + k * a_dim1];
+/* L74: */
+                    }
+                }
+            }
+        } else {
+            if (k < *n - 1) {
+                d21 = w[k + 1 + k * w_dim1];
+                d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
+                d22 = w[k + k * w_dim1] / d21;
+                t = 1. / (d11 * d22 - 1.);
+                i__1 = *n;
+                for (j = k + 2; j <= i__1; ++j) {
+                    a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
+                            j + (k + 1) * w_dim1]) / d21);
+                    a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
+                            w_dim1] - w[j + k * w_dim1]) / d21);
+/* L80: */
+                }
+            }
+            a[k + k * a_dim1] = w[k + k * w_dim1];
+            a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
+            a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
+        }
+    }
+    if (kstep == 1) {
+        ipiv[k] = kp;
+    } else {
+        ipiv[k] = -p;
+        ipiv[k + 1] = -kp;
+    }
+    k += kstep;
+    goto L70;
+L90:
+    j = k - 1;
+L120:
+    kstep = 1;
+    jp1 = 1;
+    jj = j;
+    jp2 = ipiv[j];
+    if (jp2 < 0) {
+        jp2 = -jp2;
+        --j;
+        jp1 = -ipiv[j];
+        kstep = 2;
+    }
+    --j;
+    if (jp2 != jj && j >= 1) {
+        dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    jj = j + 1;
+    if (jp1 != jj && kstep == 2) {
+        dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+    }
+    if (j >= 1) {
+        goto L120;
+    }
+    *kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/dtgsyl.c b/relapack/src/dtgsyl.c
new file mode 100644
index 000000000..c506926af
--- /dev/null
+++ b/relapack/src/dtgsyl.c
@@ -0,0 +1,274 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
+    const int *, const double *, const int *, const double *, const int *,
+    double *, const int *, const double *, const int *, const double *,
+    const int *, double *, const int *, double *, double *, double *, int *,
+    int *, int *);
+
+
+/** DTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's dtgsyl.
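+ * For trans = 'N' it computes matrices R and L, returned in C and F,
+ * with
+ *     A * R - L * B = scale * C
+ *     D * R - L * E = scale * F,
+ * where the pairs (A, D) and (B, E) are in generalized real Schur
+ * form; for trans = 'T' it solves the corresponding transposed system.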
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html
+ * */
+void RELAPACK_dtgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dif,
+    double *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "T");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DTGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'T';
+
+    // Constant
+    const double ZERO[] = { 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    double scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        double dscale = 0;
+        double dsum = 1;
+        int pq;
+        RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(pq) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(dlacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m);
+                LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(dlacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** dtgsyl's recursive compute kernel */
+static void RELAPACK_dtgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dsum, double *dscale,
+    int *iWork, int *pq, int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) {
+        // Unblocked
+        LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1. };
+    double scale2[] = { 1. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        int m1 = DREC_SPLIT(*m);
+        if (A[m1 + *ldA * (m1 - 1)])
+            m1++;
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + *ldA * m1;
+        const double *const A_BR = A + *ldA * m1 + m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + m1;
+
+        // D_TL D_TR
+        // 0    D_BR
+        const double *const D_TL = D;
+        const double *const D_TR = D + *ldD * m1;
+        const double *const D_BR = D + *ldD * m1 + m1;
+
+        // F_T
+        // F_B
+        double *const F_T = F;
+        double *const F_B = F + m1;
+
+        if (*trans == 'N') {
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // F_T = F_T - D_TR * C_B
+            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
+            }
+        } else {
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
+            // C_B = C_B - A_TR^H * C_T
+            BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // C_B = C_B - D_TR^H * F_T
+            BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldF, ONE, C_B, ldC);
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
+            }
+        }
+    } else {
+        // Splitting
+        int n1 = DREC_SPLIT(*n);
+        if (B[n1 + *ldB * (n1 - 1)])
+            n1++;
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + *ldB * n1;
+        const double *const B_BR = B + *ldB * n1 + n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + *ldC * n1;
+
+        // E_TL E_TR
+        // 0    E_BR
+        const double *const E_TL = E;
+        const double *const E_TR = E + *ldE * n1;
+        const double *const E_BR = E + *ldE * n1 + n1;
+
+        // F_L F_R
+        double *const F_L = F;
+        double *const F_R = F + *ldF * n1;
+
+        if (*trans == 'N') {
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // C_R = C_R + F_L * B_TR
+            BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
+            // F_R = F_R + F_L * E_TR
+            BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
+            }
+        } else {
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
+            // F_L = F_L + C_R * B_TR
+            BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
+            // F_L = F_L + F_R * E_TR
+            BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF);
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
+            }
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c
new file mode 100644
index 000000000..c87b53ae5
--- /dev/null
+++ b/relapack/src/dtrsyl.c
@@ -0,0 +1,169 @@
+#include "relapack.h"
+
+static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *,
+    const int *, const int *, const double *, const int *, const double *,
+    const int *, double *, const int *, double *, int *);
+
+
+/** DTRSYL solves the real Sylvester matrix equation.
+ *
+ * This routine is functionally equivalent to LAPACK's dtrsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html
+ * */
+void RELAPACK_dtrsyl(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC, double *scale,
+    int *info
+) {
+
+    // Check arguments
+    const int notransA = LAPACK(lsame)(tranA, "N");
+    const int transA = LAPACK(lsame)(tranA, "T");
+    const int ctransA = LAPACK(lsame)(tranA, "C");
+    const int notransB = LAPACK(lsame)(tranB, "N");
+    const int transB = LAPACK(lsame)(tranB, "T");
+    const int ctransB = LAPACK(lsame)(tranB, "C");
+    *info = 0;
+    if (!transA && !ctransA && !notransA)
+        *info = -1;
+    else if (!transB && !ctransB && !notransB)
+        *info = -2;
+    else if (*isgn != 1 && *isgn != -1)
+        *info = -3;
+    else if (*m < 0)
+        *info = -4;
+    else if (*n < 0)
+        *info = -5;
+    else if (*ldA < MAX(1, *m))
+        *info = -7;
+    else if (*ldB < MAX(1, *n))
+        *info = -9;
+    else if (*ldC < MAX(1, *m))
+        *info = -11;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DTRSYL", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
+    const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
+
+    // Recursive kernel
+    RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+
+
+/** dtrsyl's recursive compute kernel */
+static void RELAPACK_dtrsyl_rec(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC, double *scale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) {
+        // Unblocked
+        RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const double MSGN[] = { -*isgn };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1. };
+    double scale2[] = { 1. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        int m1 = DREC_SPLIT(*m);
+        if (A[m1 + *ldA * (m1 - 1)])
+            m1++;
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + *ldA * m1;
+        const double *const A_BR = A + *ldA * m1 + m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        int n1 = DREC_SPLIT(*n);
+        if (B[n1 + *ldB * (n1 - 1)])
+            n1++;
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + *ldB * n1;
+        const double *const B_BR = B + *ldB * n1 + n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
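+    // Combine the scaling factors of the two recursive solves; as in
+    // LAPACK's dtrsyl, info = 1 only flags that perturbed values were
+    // used because A and B have common or very close eigenvalues.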
+ *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/dtrsyl_rec2.c b/relapack/src/dtrsyl_rec2.c new file mode 100644 index 000000000..479c7f340 --- /dev/null +++ b/relapack/src/dtrsyl_rec2.c @@ -0,0 +1,1034 @@ +/* -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static int c_false = FALSE_; +static int c__2 = 2; +static double c_b26 = 1.; +static double c_b30 = 0.; +static int c_true = TRUE_; + +int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, int *isgn, int + *m, int *n, double *a, int *lda, double *b, int * + ldb, double *c__, int *ldc, double *scale, int *info, + ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + double d__1, d__2; + + /* Local variables */ + static int j, k, l; + static double x[4] /* was [2][2] */; + static int k1, k2, l1, l2; + static double a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, + sgn; + extern double ddot_(int *, double *, int *, double *, + int *); + static int ierr; + static double smin, suml, sumr; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern int lsame_(char *, char *, ftnlen, ftnlen); + static int knext, lnext; + static double xnorm; + extern /* Subroutine */ int dlaln2_(int *, int *, int *, + double *, double *, double *, int *, double *, + double *, double *, int *, double *, double * + , double *, int *, double *, double *, int *), + dlasy2_(int *, int *, int *, int *, int *, + double *, int *, double *, int *, double *, + int *, double *, double *, int *, double *, + int *), dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen), dlange_(char *, int *, + int *, double *, int *, double *, ftnlen); + static double scaloc; + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + static double bignum; + static int notrna, notrnb; + static double smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "T", (ftnlen)1, (ftnlen)1) && ! lsame_( + trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T", (ftnlen)1, (ftnlen)1) && ! + lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DTRSYL", &i__1, (ftnlen)6); + return 0; + } + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + eps = dlamch_("P", (ftnlen)1); + smlnum = dlamch_("S", (ftnlen)1); + bignum = 1. 
/ smlnum; + dlabad_(&smlnum, &bignum); + smlnum = smlnum * (double) (*m * *n) / eps; + bignum = 1. / smlnum; +/* Computing MAX */ + d__1 = smlnum, d__2 = eps * dlange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * dlange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = max(d__1,d__2); + sgn = (double) (*isgn); + if (notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L60; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L50; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L20: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L30: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_false, &c_false, isgn, &c__2, &c__2, &a[k1 + + k1 * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, + &c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L50: + ; + } +L60: + ; + } + } else if (! 
notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L120; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = 1; + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + if (k < knext) { + goto L110; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L80: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L90: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_true, &c_false, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L110: + ; + } +L120: + ; + } + } else if (! notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L180; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = 1; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + if (k < knext) { + goto L170; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l1; +/* Computing MIN */ + i__3 = l1 + 1; +/* Computing MIN */ + i__4 = l1 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L130: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L140: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L150: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_true, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 * + a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L160: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L170: + ; + } +L180: + ; + } + } else if (notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L240; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L230; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l1; +/* Computing MIN */ + i__2 = l1 + 1; +/* Computing MIN */ + i__3 = l1 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) 
{ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L190: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L200: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L210: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_false, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L220: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L230: + ; + } +L240: + ; + } + } + return 0; +} diff --git a/relapack/src/dtrtri.c b/relapack/src/dtrtri.c new file mode 100644 index 000000000..0462609e9 --- /dev/null +++ b/relapack/src/dtrtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, + double *, const int *, int *); + + +/** DTRTRI computes the inverse of a real upper or lower triangular matrix A. + * + * This routine is functionally equivalent to LAPACK's dtrtri. 
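+ * The recursive kernel uses the block identity (shown for the lower
+ * triangular case; the upper triangular case is analogous)
+ *
+ *     inv( [ A_TL    0  ] ) = [ inv(A_TL)                   0         ]
+ *          [ A_BL  A_BR ]     [ -inv(A_BR)*A_BL*inv(A_TL)   inv(A_BR) ]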
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html
+ * */
+void RELAPACK_dtrtri(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DTRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ? 'N' : 'U';
+
+    // check for singularity
+    if (nounit) {
+        int i;
+        for (i = 0; i < *n; i++)
+            if (A[i + *ldA * i] == 0) {
+                *info = i + 1;
+                return;
+            }
+    }
+
+    // Recursive kernel
+    RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
+}
+
+
+/** dtrtri's recursive compute kernel */
+static void RELAPACK_dtrtri_rec(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_DTRTRI, 1)) {
+        // Unblocked
+        LAPACK(dtrti2)(uplo, diag, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL)
+    RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = - A_BL * A_TL
+        BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
+        // A_BL = A_BR \ A_BL
+        BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TR = - A_TL * A_TR
+        BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
+        // A_TR = A_TR / A_BR
+        BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/f2c.c b/relapack/src/f2c.c
new file mode 100644
index 000000000..5a3452419
--- /dev/null
+++ b/relapack/src/f2c.c
@@ -0,0 +1,109 @@
+#include "stdlib.h"
+#include "stdio.h"
+#include "signal.h"
+#include "f2c.h"
+
+#ifndef SIGIOT
+#ifdef SIGABRT
+#define SIGIOT SIGABRT
+#endif
+#endif
+
+void sig_die(const char *s, int kill) {
+    /* print error message, then clear buffers */
+    fprintf(stderr, "%s\n", s);
+
+    if(kill) {
+        fflush(stderr);
+        /* now get a core */
+        signal(SIGIOT, SIG_DFL);
+        abort();
+    } else
+        exit(1);
+}
+
+void c_div(complex *c, complex *a, complex *b) {
+    double ratio, den;
+    double abr, abi, cr;
+
+    if( (abr = b->r) < 0.)
+        abr = - abr;
+    if( (abi = b->i) < 0.)
+ abi = - abi; + if( abr <= abi ) { + if(abi == 0) { +#ifdef IEEE_COMPLEX_DIVIDE + float af, bf; + af = bf = abr; + if (a->i != 0 || a->r != 0) + af = 1.; + c->i = c->r = af / bf; + return; +#else + sig_die("complex division by zero", 1); +#endif + } + ratio = (double)b->r / b->i ; + den = b->i * (1 + ratio*ratio); + cr = (a->r*ratio + a->i) / den; + c->i = (a->i*ratio - a->r) / den; + } else { + ratio = (double)b->i / b->r ; + den = b->r * (1 + ratio*ratio); + cr = (a->r + a->i*ratio) / den; + c->i = (a->i - a->r*ratio) / den; + } + c->r = cr; +} + +void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) { + double ratio, den; + double abr, abi, cr; + + if( (abr = b->r) < 0.) + abr = - abr; + if( (abi = b->i) < 0.) + abi = - abi; + if( abr <= abi ) { + if(abi == 0) { +#ifdef IEEE_COMPLEX_DIVIDE + if (a->i != 0 || a->r != 0) + abi = 1.; + c->i = c->r = abi / abr; + return; +#else + sig_die("complex division by zero", 1); +#endif + } + ratio = b->r / b->i ; + den = b->i * (1 + ratio*ratio); + cr = (a->r*ratio + a->i) / den; + c->i = (a->i*ratio - a->r) / den; + } else { + ratio = b->i / b->r ; + den = b->r * (1 + ratio*ratio); + cr = (a->r + a->i*ratio) / den; + c->i = (a->i - a->r*ratio) / den; + } + c->r = cr; +} + +float r_imag(complex *z) { + return z->i; +} + +void r_cnjg(complex *r, complex *z) { + float zi = z->i; + r->r = z->r; + r->i = -zi; +} + +double d_imag(doublecomplex *z) { + return z->i; +} + +void d_cnjg(doublecomplex *r, doublecomplex *z) { + double zi = z->i; + r->r = z->r; + r->i = -zi; +} diff --git a/relapack/src/f2c.h b/relapack/src/f2c.h new file mode 100644 index 000000000..b94ee7c8e --- /dev/null +++ b/relapack/src/f2c.h @@ -0,0 +1,223 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." + + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +typedef long int integer; +typedef unsigned long int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +typedef long int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; +#ifdef INTEGER_STAR_8 /* Adjust for integer*8. 
*/ +typedef long long longint; /* system-dependent */ +typedef unsigned long long ulongint; /* system-dependent */ +#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) +#define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) +#endif + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +#ifdef f2c_i2 +/* for -i2 */ +typedef short flag; +typedef short ftnlen; +typedef short ftnint; +#else +typedef long int flag; +typedef long int ftnlen; +typedef long int ftnint; +#endif + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +/*typedef long int Long;*/ /* No longer used; formerly in Namelist */ + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (doublereal)abs(x) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (doublereal)min(a,b) +#define dmax(a,b) (doublereal)max(a,b) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef int /* Unknown procedure type */ (*U_fp)(...); +typedef shortint (*J_fp)(...); +typedef integer (*I_fp)(...); +typedef real (*R_fp)(...); +typedef doublereal (*D_fp)(...), (*E_fp)(...); +typedef /* Complex */ VOID (*C_fp)(...); +typedef /* Double Complex */ VOID (*Z_fp)(...); +typedef logical (*L_fp)(...); +typedef shortlogical (*K_fp)(...); +typedef /* Character */ VOID (*H_fp)(...); +typedef /* Subroutine */ int (*S_fp)(...); +#else +typedef int /* Unknown procedure type */ (*U_fp)(); +typedef shortint (*J_fp)(); +typedef integer (*I_fp)(); +typedef real (*R_fp)(); +typedef doublereal (*D_fp)(), (*E_fp)(); +typedef /* Complex */ VOID (*C_fp)(); +typedef /* Double Complex */ VOID (*Z_fp)(); +typedef logical (*L_fp)(); +typedef shortlogical (*K_fp)(); +typedef /* Character */ VOID (*H_fp)(); +typedef /* Subroutine */ int (*S_fp)(); +#endif +/* E_fp is for real functions when -R is not specified */ +typedef VOID C_f; /* complex function */ +typedef VOID H_f; /* character function */ +typedef VOID Z_f; /* double complex function */ +typedef doublereal E_f; /* real function with -R not specified */ + +/* undef any lower-case symbols that your C compiler predefines, e.g.: */ + +#ifndef Skip_f2c_Undefs +#undef cray +#undef gcos +#undef mc68010 +#undef mc68020 +#undef mips +#undef pdp11 +#undef sgi +#undef sparc +#undef sun +#undef sun2 +#undef sun3 +#undef sun4 +#undef u370 +#undef u3b +#undef u3b2 +#undef u3b5 +#undef unix +#undef vax +#endif +#endif diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h new file mode 100644 index 000000000..064276b7e --- /dev/null +++ b/relapack/src/lapack.h @@ -0,0 +1,80 @@ +#ifndef LAPACK_H +#define LAPACK_H + +extern int LAPACK(lsame)(const char *, const char *); +extern int LAPACK(xerbla)(const char *, const int *); + +extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); + +extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); +extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); +extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); +extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); + +extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); +extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); +extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); +extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, 
double *, const int *); + +extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); + +extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *); + +extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); + +extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *); +extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *); +extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *); +extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *); + +extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *); + +extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *); + +extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(chetf2_rook)(const char *, const int *, 
float *, const int *, int *, int *); +extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *); +extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *); +extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *); +extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *); + +#endif /* LAPACK_H */ diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c new file mode 100644 index 000000000..488547260 --- /dev/null +++ b/relapack/src/lapack_wrappers.c @@ -0,0 +1,607 @@ +#include "relapack.h" + +//////////// +// XLAUUM // +//////////// + +#if INCLUDE_SLAUUM +void LAPACK(slauum)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_slauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_DLAUUM +void LAPACK(dlauum)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dlauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_CLAUUM +void LAPACK(clauum)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_clauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZLAUUM +void LAPACK(zlauum)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_zlauum(uplo, n, A, ldA, info); +} +#endif + + +//////////// +// XSYGST // +//////////// + +#if INCLUDE_SSYGST +void LAPACK(ssygst)( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_DSYGST +void LAPACK(dsygst)( + const int *itype, 
const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_CHEGST +void LAPACK(chegst)( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_ZHEGST +void LAPACK(zhegst)( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + + +//////////// +// XTRTRI // +//////////// + +#if INCLUDE_STRTRI +void LAPACK(strtri)( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_strtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_DTRTRI +void LAPACK(dtrtri)( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_CTRTRI +void LAPACK(ctrtri)( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZTRTRI +void LAPACK(ztrtri)( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); +} +#endif + + +//////////// +// XPOTRF // +//////////// + +#if INCLUDE_SPOTRF +void LAPACK(spotrf)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_spotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_DPOTRF +void LAPACK(dpotrf)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dpotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_CPOTRF +void LAPACK(cpotrf)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_cpotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZPOTRF +void LAPACK(zpotrf)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_zpotrf(uplo, n, A, ldA, info); +} +#endif + + +//////////// +// XPBTRF // +//////////// + +#if INCLUDE_SPBTRF +void LAPACK(spbtrf)( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_DPBTRF +void LAPACK(dpbtrf)( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_CPBTRF +void LAPACK(cpbtrf)( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_ZPBTRF +void LAPACK(zpbtrf)( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + + +//////////// +// XSYTRF // +//////////// + +#if INCLUDE_SSYTRF +void LAPACK(ssytrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_DSYTRF +void LAPACK(dsytrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) 
{ + RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CSYTRF +void LAPACK(csytrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZSYTRF +void LAPACK(zsytrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CHETRF +void LAPACK(chetrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZHETRF +void LAPACK(zhetrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_SSYTRF_ROOK +void LAPACK(ssytrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_DSYTRF_ROOK +void LAPACK(dsytrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CSYTRF_ROOK +void LAPACK(csytrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZSYTRF_ROOK +void LAPACK(zsytrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CHETRF_ROOK +void LAPACK(chetrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZHETRF_ROOK +void LAPACK(zhetrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + + +//////////// +// XGETRF // +//////////// + +#if INCLUDE_SGETRF +void LAPACK(sgetrf)( + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_DGETRF +void LAPACK(dgetrf)( + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_CGETRF +void LAPACK(cgetrf)( + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_ZGETRF +void LAPACK(zgetrf)( + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); +} +#endif + + +//////////// +// XGBTRF // +//////////// + +#if INCLUDE_SGBTRF +void LAPACK(sgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + 
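+    // thin wrapper: pass every argument through unchanged to the recursive implementation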
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_DGBTRF +void LAPACK(dgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_CGBTRF +void LAPACK(cgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_ZGBTRF +void LAPACK(zgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + + +//////////// +// XTRSYL // +//////////// + +#if INCLUDE_STRSYL +void LAPACK(strsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_DTRSYL +void LAPACK(dtrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_CTRSYL +void LAPACK(ctrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_ZTRSYL +void LAPACK(ztrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + + +//////////// +// XTGSYL // +//////////// + +#if INCLUDE_STGSYL +void LAPACK(stgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, + float *scale, float *dif, + float *Work, const int *lWork, int *iWork, int *info +) { + RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); +} +#endif + +#if INCLUDE_DTGSYL +void LAPACK(dtgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, + double *scale, double *dif, + double *Work, const int *lWork, int *iWork, int *info +) { + RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); +} +#endif + +#if INCLUDE_CTGSYL +void LAPACK(ctgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, 
const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dif,
+    float *Work, const int *lWork, int *iWork, int *info
+) {
+    RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+#if INCLUDE_ZTGSYL
+void LAPACK(ztgsyl)(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dif,
+    double *Work, const int *lWork, int *iWork, int *info
+) {
+    RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+
+////////////
+// XGEMMT //
+////////////
+
+#if INCLUDE_SGEMMT
+void LAPACK(sgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+    RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_DGEMMT
+void LAPACK(dgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+    RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_CGEMMT
+void LAPACK(cgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+    RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_ZGEMMT
+void LAPACK(zgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+    RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
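These wrappers are what make the recursive routines drop-in replacements: each one forwards its arguments to the corresponding RELAPACK_* implementation under the standard LAPACK symbol name, so existing callers pick them up at link time without source changes. As a minimal usage sketch (not part of this patch; it assumes the library was built with LAPACK_UNDERSCORE=1, so the wrapped symbols carry a trailing underscore), a C caller reaches RELAPACK_dpotrf through the ordinary dpotrf_ entry point:

#include <stdio.h>

/* Fortran-style prototype; with LAPACK_UNDERSCORE=1 the dpotrf wrapper
 * above is emitted as dpotrf_ and forwards to RELAPACK_dpotrf. */
extern void dpotrf_(const char *uplo, const int *n, double *A,
                    const int *ldA, int *info);

int main(void) {
    /* 2x2 symmetric positive definite matrix, column-major */
    double A[4] = { 4.0, 2.0, 2.0, 3.0 };
    const int n = 2, ldA = 2;
    int info;

    dpotrf_("L", &n, A, &ldA, &info); /* A = L * L**T, L in the lower triangle */

    if (info == 0)
        printf("L = [ %g 0 ; %g %g ]\n", A[0], A[1], A[3]); /* 2 1 sqrt(2) */
    else
        printf("dpotrf: info = %d\n", info);
    return 0;
}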
diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h
new file mode 100644
index 000000000..2cb061c32
--- /dev/null
+++ b/relapack/src/relapack.h
@@ -0,0 +1,60 @@
+#ifndef RELAPACK_INT_H
+#define RELAPACK_INT_H
+
+#include "../config.h"
+
+#include "../inc/relapack.h"
+
+// add an underscore to BLAS routines (or not)
+#if BLAS_UNDERSCORE
+#define BLAS(routine) routine ## _
+#else
+#define BLAS(routine) routine
+#endif
+
+// add an underscore to LAPACK routines (or not)
+#if LAPACK_UNDERSCORE
+#define LAPACK(routine) routine ## _
+#else
+#define LAPACK(routine) routine
+#endif
+
+// minimum and maximum macros
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+// XREC_SPLIT(n) returns how a problem of size n is split recursively.
+// Above the threshold, the split makes the size of at least one half a
+// multiple of a full 64-byte cache line of elements (16 floats; 8 doubles
+// or complex floats; 4 double complex) while keeping both halves as close
+// in size as possible; e.g. DREC_SPLIT(100) = ((100 + 8) / 16) * 8 = 48,
+// leaving halves of 48 and 52.
+// Below the threshold, the problem is simply split in the middle. (Note
+// that the crossover size is usually larger than the threshold.)
+#define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2)
+#define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
+#define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
+#define ZREC_SPLIT(n) ((n >= 8) ?
((n + 4) / 8) * 4 : n / 2) + +#include "lapack.h" +#include "blas.h" + +// sytrf helper routines +void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); + +// trsyl helper routines +void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); + +#endif /* RELAPACK_INT_H */ diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c new file mode 100644 index 000000000..bc20e744b --- /dev/null +++ b/relapack/src/sgbtrf.c @@ -0,0 +1,227 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, + const int *, float *, const int *, int *, float *, const int *, float *, + const int *, int *); + + +/** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's sgbtrf. 
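 * The matrix is expected in LAPACK band storage: Ab has ldAb >= 2*kl + ku + 1
 * rows, entry A(i,j) is held at Ab(kl + ku + 1 + i - j, j) (1-based), and the
 * first kl rows are scratch space for the fill-in produced by row pivoting.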
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html
+ * */
+void RELAPACK_sgbtrf(
+    const int *m, const int *n, const int *kl, const int *ku,
+    float *Ab, const int *ldAb, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kl < 0)
+        *info = -3;
+    else if (*ku < 0)
+        *info = -4;
+    else if (*ldAb < 2 * *kl + *ku + 1)
+        *info = -6;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SGBTRF", &minfo);
+        return;
+    }
+
+    // Constant
+    const float ZERO[] = { 0. };
+
+    // Resulting upper band width
+    const int kv = *ku + *kl;
+
+    // Unskew A
+    const int ldA[] = { *ldAb - 1 };
+    float *const A = Ab + kv;
+
+    // Zero upper diagonal fill-in elements
+    int i, j;
+    for (j = 0; j < *n; j++) {
+        float *const A_j = A + *ldA * j;
+        for (i = MAX(0, j - kv); i < j - *ku; i++)
+            A_j[i] = 0.;
+    }
+
+    // Allocate work space
+    const int n1 = SREC_SPLIT(*n);
+    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
+    const int nWorkl = (kv > n1) ? n1 : kv;
+    const int mWorku = (*kl > n1) ? n1 : *kl;
+    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
+    float *Worku = malloc(mWorku * nWorku * sizeof(float));
+    LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
+    LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
+
+    // Recursive kernel
+    RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
+
+    // Free work space
+    free(Workl);
+    free(Worku);
+}
+
+
+/** sgbtrf's recursive compute kernel */
+static void RELAPACK_sgbtrf_rec(
+    const int *m, const int *n, const int *kl, const int *ku,
+    float *Ab, const int *ldAb, int *ipiv,
+    float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
+        // Unblocked
+        LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1.
}; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + kv; + + // Splitting + const int n1 = MIN(SREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + float *const Ab_L = Ab; + float *const Ab_BR = Ab + *ldAb * n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + m1; + float *const A_BR = A + *ldA * n1 + m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + float *const A_Rl = A_R; + float *const A_Rr = A_R + *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + m21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + *ldA * n21; + float *const A_BRbl = A_BR + m21; + float *const A_BRbr = A_BR + *ldA * n21 + m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // apply pivots to A_Rl + LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); + + // apply pivots to A_Rr columnwise + for (j = 0; j < n22; j++) { + float *const A_Rrj = A_Rr + *ldA * j; + for (i = j; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + const float tmp = A_Rrj[i]; + A_Rrj[i] = A_Rr[ip]; + A_Rrj[ip] = tmp; + } + } + } + + // A_TRl = A_TL \ A_TRl + BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // Worku = A_TRr + LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); + // Worku = A_TL \ Worku + BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + // A_TRr = Worku + LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); + // A_BRtl = A_BRtl - A_BLt * A_TRl + BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); + // A_BRbl = A_BRbl - Workl * A_TRl + BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); + // A_BRtr = A_BRtr - A_BLt * Worku + BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Workl * Worku + BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); + + // partially undo swaps in A_L + for (i = mn1 - 1; i >= 0; i--) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // recursion(Ab_BR, ipiv_B) + RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, 
ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/sgemmt.c b/relapack/src/sgemmt.c new file mode 100644 index 000000000..75f78fabd --- /dev/null +++ b/relapack/src/sgemmt.c @@ -0,0 +1,165 @@ +#include "relapack.h" + +static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); + +static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); + + +/** SGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_sgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !notransA) + info = 2; + else if (!tranB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("SGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : 'T'; + const char cleantransB = notransB ? 'N' : 'T'; + + // Recursive kernel + RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** sgemmt's recursive compute kernel */ +static void RELAPACK_sgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { + // Unblocked + RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const float *const A_T = A; + const float *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const float *const B_L = B; + const float *const B_R = B + ((*transB == 'N') ? 
*ldB * n1 : n1);
+
+    // C_TL C_TR
+    // C_BL C_BR
+    float *const C_TL = C;
+    float *const C_TR = C + *ldC * n1;
+    float *const C_BL = C + n1;
+    float *const C_BR = C + *ldC * n1 + n1;
+
+    // recursion(C_TL)
+    RELAPACK_sgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
+
+    if (*uplo == 'L')
+        // C_BL = alpha A_B B_L + beta C_BL
+        BLAS(sgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
+    else
+        // C_TR = alpha A_T B_R + beta C_TR
+        BLAS(sgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
+
+    // recursion(C_BR)
+    RELAPACK_sgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
+}
+
+
+/** sgemmt's unblocked compute kernel */
+static void RELAPACK_sgemmt_rec2(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+
+    const int incB = (*transB == 'N') ? 1 : *ldB;
+    const int incC = 1;
+
+    int i;
+    for (i = 0; i < *n; i++) {
+        // A_0
+        // A_i
+        const float *const A_0 = A;
+        const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
+
+        // * B_i *
+        const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
+
+        // * C_0i *
+        // * C_ii *
+        float *const C_0i = C + *ldC * i;
+        float *const C_ii = C + *ldC * i + i;
+
+        if (*uplo == 'L') {
+            const int nmi = *n - i;
+            if (*transA == 'N')
+                BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+            else
+                BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+        } else {
+            const int ip1 = i + 1;
+            if (*transA == 'N')
+                BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+            else
+                BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+        }
+    }
+}
diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c
new file mode 100644
index 000000000..284f8cff6
--- /dev/null
+++ b/relapack/src/sgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *,
+    int *, int *);
+
+
+/** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's sgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html
+ * */
+void RELAPACK_sgetrf(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const float ONE[] = { 1. };
+        const int iONE[] = { 1
}; + + // Splitting + const int rn = *n - *m; + + // A_L A_R + const float *const A_L = A; + float *const A_R = A + *ldA * *m; + + // A_R = apply(ipiv, A_R) + LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); + // A_R = A_L \ A_R + BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); + } +} + + +/** sgetrf's recursive compute kernel */ +static void RELAPACK_sgetrf_rec( + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info +) { + + if (*n <= MAX(CROSSOVER_SGETRF, 1)) { + // Unblocked + LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // recursion(A_L, ipiv_T) + RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + // apply pivots to A_R + LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); + + // A_TR = A_TL \ A_TR + BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_BL * A_TR + BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); + + // recursion(A_BR, ipiv_B) + RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); + if (*info) + *info += n1; + // apply pivots to A_BL + LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); + // shift pivots + int i; + for (i = 0; i < n2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/slauum.c b/relapack/src/slauum.c new file mode 100644 index 000000000..280f141b3 --- /dev/null +++ b/relapack/src/slauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_slauum_rec(const char *, const int *, float *, + const int *, int *); + + +/** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's slauum. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html + * */ +void RELAPACK_slauum( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** slauum's recursive compute kernel */ +static void RELAPACK_slauum_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_SLAUUM, 1)) { + // Unblocked + LAPACK(slauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1. 
}; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_slauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(ssyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(ssyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_slauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c new file mode 100644 index 000000000..ee0a5546e --- /dev/null +++ b/relapack/src/spbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, + float *, const int *, float *, const int *, int *); + + +/** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's spbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html + * */ +void RELAPACK_spbtrf( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const float ZERO[] = { 0. }; + + // Allocate work space + const int n1 = SREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + float *Work = malloc(mWork * nWork * sizeof(float)); + LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** spbtrf's recursive compute kernel */ +static void RELAPACK_spbtrf_rec( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + float *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { + // Unblocked + LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + ((*uplo == 'L') ? 
0 : *kd); + + // Splitting + const int n1 = MIN(SREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + float *const Ab_BR = Ab + *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + n21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + *ldA * n21; + float *const A_BRbl = A_BR + n21; + float *const A_BRbr = A_BR + *ldA * n21 + n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/spotrf.c b/relapack/src/spotrf.c new file mode 100644 index 000000000..2a609321b --- /dev/null +++ b/relapack/src/spotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static void RELAPACK_spotrf_rec(const char *, const int *, float *, + const int *, int *); + + +/** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's spotrf. 
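 * The recursion splits A into quadrants, factors the top-left block, solves
 * a triangular system for the off-diagonal block (strsm), downdates the
 * trailing block (ssyrk), and recurses on it; blocks at or below the
 * crossover size are handled by the unblocked spotf2.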
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html + * */ +void RELAPACK_spotrf( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SPOTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info); +} + + +/** spotrf's recursive compute kernel */ +static void RELAPACK_spotrf_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { + // Unblocked + LAPACK(spotf2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = A_BL / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); + // A_BR = A_BR - A_BL * A_BL' + BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); + } else { + // A_TR = A_TL' \ A_TR + BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_TR' * A_TR + BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); + } + + // recursion(A_BR) + RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c new file mode 100644 index 000000000..7f145cdec --- /dev/null +++ b/relapack/src/ssygst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_ssygst_rec(const int *, const char *, const int *, + float *, const int *, const float *, const int *, + float *, const int *, int *); + + +/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's ssygst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html + * */ +void RELAPACK_ssygst( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SSYGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 
diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c
new file mode 100644
index 000000000..7f145cdec
--- /dev/null
+++ b/relapack/src/ssygst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_ssygst_rec(const int *, const char *, const int *,
+    float *, const int *, const float *, const int *,
+    float *, const int *, int *);
+
+
+/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's ssygst.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html
+ * */
+void RELAPACK_ssygst(
+    const int *itype, const char *uplo, const int *n,
+    float *A, const int *ldA, const float *B, const int *ldB,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (*itype < 1 || *itype > 3)
+        *info = -1;
+    else if (!lower && !upper)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    else if (*ldB < MAX(1, *n))
+        *info = -7;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SSYGST", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Allocate work space
+    float *Work = NULL;
+    int lWork = 0;
+#if XSYGST_ALLOW_MALLOC
+    const int n1 = SREC_SPLIT(*n);
+    lWork = n1 * (*n - n1);
+    Work = malloc(lWork * sizeof(float));
+    if (!Work)
+        lWork = 0;
+#endif
+
+    // Recursive kernel
+    RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
+
+    // Free work space
+#if XSYGST_ALLOW_MALLOC
+    if (Work)
+        free(Work);
+#endif
+}
+
+
+/** ssygst's recursive compute kernel */
+static void RELAPACK_ssygst_rec(
+    const int *itype, const char *uplo, const int *n,
+    float *A, const int *ldA, const float *B, const int *ldB,
+    float *Work, const int *lWork, int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
+        // Unblocked
+        LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info);
+        return;
+    }
+
+    // Constants
+    const float ZERO[] = { 0. };
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const float HALF[] = { .5 };
+    const float MHALF[] = { -.5 };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = SREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + n1;
+    float *const A_BR = A + *ldA * n1 + n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const float *const B_TL = B;
+    const float *const B_TR = B + *ldB * n1;
+    const float *const B_BL = B + n1;
+    const float *const B_BR = B + *ldB * n1 + n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
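    /* Background on the branches below: for itype = 1 the routine forms
       inv(L) * A * inv(L') (uplo = 'L') or inv(U') * A * inv(U), while
       itype = 2 or 3 form L' * A * L or U * A * U', where B = L * L' or
       U' * U was factored beforehand by spotrf.  In every branch the
       off-diagonal block receives a +/- 1/2 symmetric correction both
       before and after the ssyr2k, which is why each ssymm (or its
       buffered saxpy variant) appears twice. */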
+
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR' * A_BL
+            BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR'
+            BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
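When XSYGST_ALLOW_MALLOC provides a buffer, the kernel above computes the -1/2 correction term T once with ssymm and then adds it to the off-diagonal block column by column, before and after the ssyr2k, saving one ssymm per recursion level. A minimal sketch of that column-wise add, assuming the Fortran BLAS symbol saxpy_ and the block shapes of the lower-triangular branch:

/* assumed Fortran BLAS symbol (single-precision y := alpha*x + y) */
void saxpy_(const int *n, const float *alpha, const float *x,
            const int *incx, float *y, const int *incy);

/* Add the n2-by-n1 buffer T (leading dimension n2) onto the block A_BL
 * (leading dimension ldA), one column per call, mirroring the kernel's
 * reuse of the buffered ssymm result. */
static void add_buffered_block(const int n2, const int n1,
                               const float *T, float *A_BL, const int ldA) {
    const float ONE = 1.f;
    const int iONE = 1;
    int i;
    for (i = 0; i < n1; i++)
        saxpy_(&n2, &ONE, T + n2 * i, &iONE, A_BL + ldA * i, &iONE);
}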
diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c
new file mode 100644
index 000000000..8a4fad9f2
--- /dev/null
+++ b/relapack/src/ssytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *,
+    float *, const int *, int *, float *, const int *, int *);
+
+
+/** SSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's ssytrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html
+ * */
+void RELAPACK_ssytrf(
+    const char *uplo, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *lWork, int *info
+) {
+
+    // Required work size
+    const int cleanlWork = *n * (*n / 2);
+    int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+    minlWork = 1;
+#endif
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minlWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Work size query
+        *Work = cleanlWork;
+        return;
+    }
+
+    // Ensure Work size
+    float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < cleanlWork) {
+        cleanWork = malloc(cleanlWork * sizeof(float));
+        if (!cleanWork)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SSYTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Dummy arguments
+    int nout;
+
+    // Recursive kernel
+    RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (cleanWork != Work)
+        free(cleanWork);
+#endif
+}
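The lWork == -1 path above implements LAPACK's standard workspace query. A hedged usage sketch (prototype as declared above; a production caller would also check info after each call):

#include <stdlib.h>

/* prototype as introduced in this patch */
void RELAPACK_ssytrf(const char *uplo, const int *n, float *A, const int *ldA,
                     int *ipiv, float *Work, const int *lWork, int *info);

/* Two-phase call: query the required workspace size, then factorize. */
int factor(int n, float *A, int ldA, int *ipiv) {
    float query;
    int lWork = -1, info;
    RELAPACK_ssytrf("L", &n, A, &ldA, ipiv, &query, &lWork, &info);
    lWork = (int) query;
    float *Work = malloc(lWork * sizeof(float));
    if (!Work)
        return -1;
    RELAPACK_ssytrf("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
    free(Work);
    return info;
}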
+
+
+/** ssytrf's recursive compute kernel */
+static void RELAPACK_ssytrf_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *ldWork, int *info
+) {
+
+    // top recursion level?
+    const int top = *n_full == *n;
+
+    if (*n <= MAX(CROSSOVER_SSYTRF, 3)) {
+        // Unblocked
+        if (top) {
+            LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
+            *n_out = *n;
+        } else
+            RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+        return;
+    }
+
+    int info1, info2;
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    const int n_rest = *n_full - *n;
+
+    if (*uplo == 'L') {
+        // Splitting (setup)
+        int n1 = SREC_SPLIT(*n);
+        int n2 = *n - n1;
+
+        // Work_L *
+        float *const Work_L = Work;
+
+        // recursion(A_L)
+        int n1_out;
+        RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+        n1 = n1_out;
+
+        // Splitting (continued)
+        n2 = *n - n1;
+        const int n_full2 = *n_full - n1;
+
+        // *      *
+        // A_BL   A_BR
+        // A_BL_B A_BR_B
+        float *const A_BL   = A + n1;
+        float *const A_BR   = A + *ldA * n1 + n1;
+        float *const A_BL_B = A + *n;
+        float *const A_BR_B = A + *ldA * n1 + *n;
+
+        // *       *
+        // Work_BL Work_BR
+        // *       *
+        // (top recursion level: use Work as Work_BR)
+        float *const Work_BL = Work + n1;
+        float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
+        const int ldWork_BR = top ? n2 : *ldWork;
+
+        // ipiv_T
+        // ipiv_B
+        int *const ipiv_B = ipiv + n1;
+
+        // A_BR = A_BR - A_BL Work_BL'
+        RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+        BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+        // recursion(A_BR)
+        int n2_out;
+        RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+        if (n2_out != n2) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // last column of A_BR
+            float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
+
+            // last row of A_BL
+            float *const A_BL_b = A_BL + n2_out;
+
+            // last row of Work_BL
+            float *const Work_BL_b = Work_BL + n2_out;
+
+            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+            BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+        }
+        n2 = n2_out;
+
+        // shift pivots
+        for (i = 0; i < n2; i++)
+            if (ipiv_B[i] > 0)
+                ipiv_B[i] += n1;
+            else
+                ipiv_B[i] -= n1;
+
+        *info = info1 || info2;
+        *n_out = n1 + n2;
+    } else {
+        // Splitting (setup)
+        int n2 = SREC_SPLIT(*n);
+        int n1 = *n - n2;
+
+        // * Work_R
+        // (top recursion level: use Work as Work_R)
+        float *const Work_R = top ? Work : Work + *ldWork * n1;
+
+        // recursion(A_R)
+        int n2_out;
+        RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+        const int n2_diff = n2 - n2_out;
+        n2 = n2_out;
+
+        // Splitting (continued)
+        n1 = *n - n2;
+        const int n_full1 = *n_full - n2;
+
+        // * A_TL_T A_TR_T
+        // * A_TL   A_TR
+        // * *      *
+        float *const A_TL_T = A + *ldA * n_rest;
+        float *const A_TR_T = A + *ldA * (n_rest + n1);
+        float *const A_TL   = A + *ldA * n_rest + n_rest;
+        float *const A_TR   = A + *ldA * (n_rest + n1) + n_rest;
+
+        // Work_L *
+        // *      Work_TR
+        // *      *
+        // (top recursion level: Work_R was Work)
+        float *const Work_L  = Work;
+        float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
+        const int ldWork_L = top ? n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/ssytrf_rec2.c b/relapack/src/ssytrf_rec2.c
new file mode 100644
index 000000000..edc9269ec
--- /dev/null
+++ b/relapack/src/ssytrf_rec2.c
@@ -0,0 +1,351 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static float c_b8 = -1.f; +static float c_b9 = 1.f; + +/** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. + * + * This routine is a minor modification of LAPACK's slasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int * + nb, int *kb, float *a, int *lda, int *ipiv, float *w, + int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + float r__1, r__2, r__3; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k; + static float t, r1, d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *), + sgemv_(char *, int *, int *, float *, float *, int *, + float *, int *, float *, float *, int *, ftnlen); + static int kstep; + extern /* Subroutine */ int scopy_(int *, float *, int *, float *, + int *), sswap_(int *, float *, int *, float *, int * + ); + static float absakk; + extern int isamax_(int *, float *, int *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + kstep = 1; + absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); + if (imax > 1) { + i__1 = imax - 1; + jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 
1], &c__1); +/* Computing MAX */ + r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1], + dabs(r__1)); + rowmax = dmax(r__2,r__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >= + alpha * rowmax) { + kp = imax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kk - 1 - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + r1 = 1.f / a[k + k * a_dim1]; + i__1 = k - 1; + sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + d21 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d21; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + d21 = t / d21; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) + * w_dim1] - w[j + kw * w_dim1]); + a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ 
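/* Pivot-selection note: alpha = (1 + sqrt(17)) / 8, about 0.6404, is the
   classic Bunch-Kaufman growth constant.  colmax is the largest
   off-diagonal magnitude in column k of the updated panel, rowmax the
   largest magnitude in row imax.  The k-th diagonal is kept as a 1x1
   pivot when absakk >= alpha * colmax * (colmax / rowmax); failing that,
   a 1x1 pivot at imax is used when |w(imax)| >= alpha * rowmax, and only
   when both tests fail does the factorization take a 2x2 pivot
   (kstep = 2). */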
+ r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1], + dabs(r__1)); + rowmax = dmax(r__2,r__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >= + alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k + kstep - 1; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kp - kk - 1; + scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = *n - k; + sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + d21 = t / d21; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - + w[j + (k + 1) * w_dim1]); + a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c new file mode 100644 index 000000000..040df2484 --- /dev/null +++ b/relapack/src/ssytrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include +#endif + +static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); + + +/** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's ssytrf_rook. 
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html + * */ +void RELAPACK_ssytrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** ssytrf_rook's recursive compute kernel */ +static void RELAPACK_ssytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = SREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + float *const A_BL_B = A + *n; + float *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + n1; + float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = SREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + *ldA * n_rest; + float *const A_TR_T = A + *ldA * (n_rest + n1); + float *const A_TL = A + *ldA * n_rest + n_rest; + float *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/ssytrf_rook_rec2.c b/relapack/src/ssytrf_rook_rec2.c new file mode 100644 index 000000000..3308826d7 --- /dev/null +++ b/relapack/src/ssytrf_rook_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static float c_b9 = -1.f; +static float c_b10 = 1.f; + +/** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. + * + * This routine is a minor modification of LAPACK's slasyf_rook. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, float *a, int *lda, int *ipiv, float * + w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + float r__1; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k, p; + static float t, r1, d11, d12, d21, d22; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static logical done; + static int imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + static float sfmin; + static int itemp; + extern /* Subroutine */ int sgemv_(char *, int *, int *, float *, + float *, int *, float *, int *, float *, float *, int *, + ftnlen); + static int kstep; + static float stemp; + extern /* Subroutine */ int scopy_(int *, float *, int *, float *, + int *), sswap_(int *, float *, int *, float *, int * + ); + static float absakk; + extern double slamch_(char *, ftnlen); + extern int isamax_(int *, float *, int *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + sfmin = slamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) < + alpha * rowmax)) { + kp = imax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = k - 1 - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = k - 1; + sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else if (a[k + k * a_dim1] != 0.f) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L14: */ + } + } + } + } else { + if (k > 2) { + d12 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d12; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; + t = 1.f / (d11 * d22 - 1.f); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * + w_dim1] - w[j + kw * w_dim1]) / d12); + a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]) / d12); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; 
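/* Interchange replay: a positive ipiv(j) records a 1x1 pivot whose row
   and column were swapped with row ipiv(j); a negative pair
   ipiv(j) = -p, ipiv(j+1) = -q records a 2x2 pivot with two separate
   interchanges (rook pivoting may select two distinct rows, unlike the
   standard Bunch-Kaufman kernel).  The loop below applies those swaps
   to the columns outside the factorized panel so that the recursive
   caller sees a consistently permuted matrix. */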
+ sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) < + alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! 
done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = kp - k - 1; + scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = *n - k; + sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else if (a[k + k * a_dim1] != 0.f) { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L74: */ + } + } + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[ + j + (k + 1) * w_dim1]) / d21); + a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]) / d21); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + jj = j + 1; + if (jp1 != jj && kstep == 2) { + sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j >= 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/stgsyl.c b/relapack/src/stgsyl.c new file mode 100644 index 000000000..1870fb928 --- /dev/null +++ b/relapack/src/stgsyl.c @@ -0,0 +1,274 @@ +#include "relapack.h" +#include + +static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, + const int *, const float *, const int *, const float *, const int *, + float *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, float *, float *, int *, int *, + int *); + + +/** STGSYL solves the generalized Sylvester equation. + * + * This routine is functionally equivalent to LAPACK's stgsyl. 
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html
+ * */
+void RELAPACK_stgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dif,
+    float *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "T");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("STGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'T';
+
+    // Constant
+    const float ZERO[] = { 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    float scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        float dscale = 0;
+        float dsum = 1;
+        int pq;
+        RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(pq) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(slacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m);
+                LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(slacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** stgsyl's recursive compute kernel */
+static void RELAPACK_stgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dsum, float *dscale,
+    int *iWork, int *pq, int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) {
+        // Unblocked
+        LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1. };
+    float scale2[] = { 1. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
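    /* The split below halves the larger dimension.  For trans = 'N' the
       recursion solves the coupled generalized Sylvester equations
           A * R - L * B = scale * C
           D * R - L * E = scale * F
       with (A, D) and (B, E) in generalized Schur form; the subdiagonal
       tests on A and B below shift a split point by one row or column so
       that it never cuts through a 2x2 diagonal block. */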
+
+    if (*m > *n) {
+        // Splitting
+        int m1 = SREC_SPLIT(*m);
+        if (A[m1 + *ldA * (m1 - 1)])
+            m1++;
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const float *const A_TL = A;
+        const float *const A_TR = A + *ldA * m1;
+        const float *const A_BR = A + *ldA * m1 + m1;
+
+        // C_T
+        // C_B
+        float *const C_T = C;
+        float *const C_B = C + m1;
+
+        // D_TL D_TR
+        // 0    D_BR
+        const float *const D_TL = D;
+        const float *const D_TR = D + *ldD * m1;
+        const float *const D_BR = D + *ldD * m1 + m1;
+
+        // F_T
+        // F_B
+        float *const F_T = F;
+        float *const F_B = F + m1;
+
+        if (*trans == 'N') {
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // F_T = F_T - D_TR * C_B
+            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
+            }
+        } else {
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
+            // C_B = C_B - A_TR^H * C_T
+            BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // C_B = C_B - D_TR^H * F_T
+            BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldF, ONE, C_B, ldC);
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
+            }
+        }
+    } else {
+        // Splitting
+        int n1 = SREC_SPLIT(*n);
+        if (B[n1 + *ldB * (n1 - 1)])
+            n1++;
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const float *const B_TL = B;
+        const float *const B_TR = B + *ldB * n1;
+        const float *const B_BR = B + *ldB * n1 + n1;
+
+        // C_L C_R
+        float *const C_L = C;
+        float *const C_R = C + *ldC * n1;
+
+        // E_TL E_TR
+        // 0    E_BR
+        const float *const E_TL = E;
+        const float *const E_TR = E + *ldE * n1;
+        const float *const E_BR = E + *ldE * n1 + n1;
+
+        // F_L F_R
+        float *const F_L = F;
+        float *const F_R = F + *ldF * n1;
+
+        if (*trans == 'N') {
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // C_R = C_R + F_L * B_TR
+            BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
+            // F_R = F_R + F_L * E_TR
+            BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
+            }
+        } else {
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
+            // F_L = F_L + C_R * B_TR'
+            BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
+            // F_L = F_L + F_R * E_TR'
+            BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF);
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
+            // apply scale
+            if (scale2[0] != 1) {
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
+            }
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c
new file mode 100644
index 000000000..83947ef1a
--- /dev/null
+++ b/relapack/src/strsyl.c
@@ -0,0 +1,169 @@
+#include "relapack.h"
+
+static void RELAPACK_strsyl_rec(const char *, const char *, const int *,
+    const int *, const int *, const float *, const int *, const float *,
+    const int *, float *, const int *, float *, int *);
+
+
+/** STRSYL solves the real Sylvester matrix equation.
+ *
+ * This routine is functionally equivalent to LAPACK's strsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html
+ * */
+void RELAPACK_strsyl(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC, float *scale,
+    int *info
+) {
+
+    // Check arguments
+    const int notransA = LAPACK(lsame)(tranA, "N");
+    const int transA = LAPACK(lsame)(tranA, "T");
+    const int ctransA = LAPACK(lsame)(tranA, "C");
+    const int notransB = LAPACK(lsame)(tranB, "N");
+    const int transB = LAPACK(lsame)(tranB, "T");
+    const int ctransB = LAPACK(lsame)(tranB, "C");
+    *info = 0;
+    if (!transA && !ctransA && !notransA)
+        *info = -1;
+    else if (!transB && !ctransB && !notransB)
+        *info = -2;
+    else if (*isgn != 1 && *isgn != -1)
+        *info = -3;
+    else if (*m < 0)
+        *info = -4;
+    else if (*n < 0)
+        *info = -5;
+    else if (*ldA < MAX(1, *m))
+        *info = -7;
+    else if (*ldB < MAX(1, *n))
+        *info = -9;
+    else if (*ldC < MAX(1, *m))
+        *info = -11;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("STRSYL", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
+    const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
+
+    // Recursive kernel
+    RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
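As with spotrf above, this wrapper is call-compatible with LAPACK's strsyl: op(A)*X + isgn*X*op(B) = scale*C is solved in place, with C overwritten by X and scale set to avoid overflow. A minimal sketch for the scalar case (assuming the prototype above):

#include <stdio.h>

/* prototype as introduced in this patch */
void RELAPACK_strsyl(const char *tranA, const char *tranB, const int *isgn,
                     const int *m, const int *n,
                     const float *A, const int *ldA, const float *B, const int *ldB,
                     float *C, const int *ldC, float *scale, int *info);

int main(void) {
    /* 1x1 quasi-triangular case: 2*x + x*3 = 10, so x = 2 */
    const int m = 1, n = 1, isgn = 1, ld = 1;
    float A = 2.f, B = 3.f, C = 10.f, scale;
    int info;

    RELAPACK_strsyl("N", "N", &isgn, &m, &n, &A, &ld, &B, &ld, &C, &ld, &scale, &info);
    printf("x = %g, scale = %g, info = %d\n", C / scale, scale, info);
    return info;
}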
+
+
+/** strsyl's recursive compute kernel */
+static void RELAPACK_strsyl_rec(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC, float *scale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) {
+        // Unblocked
+        RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const float MSGN[] = { -*isgn };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1. };
+    float scale2[] = { 1. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        int m1 = SREC_SPLIT(*m);
+        if (A[m1 + *ldA * (m1 - 1)])
+            m1++;
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const float *const A_TL = A;
+        const float *const A_TR = A + *ldA * m1;
+        const float *const A_BR = A + *ldA * m1 + m1;
+
+        // C_T
+        // C_B
+        float *const C_T = C;
+        float *const C_B = C + m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        int n1 = SREC_SPLIT(*n);
+        if (B[n1 + *ldB * (n1 - 1)])
+            n1++;
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const float *const B_TL = B;
+        const float *const B_TR = B + *ldB * n1;
+        const float *const B_BR = B + *ldB * n1 + n1;
+
+        // C_L C_R
+        float *const C_L = C;
+        float *const C_R = C + *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    *scale =
scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/strsyl_rec2.c b/relapack/src/strsyl_rec2.c new file mode 100644 index 000000000..6d40a475d --- /dev/null +++ b/relapack/src/strsyl_rec2.c @@ -0,0 +1,1029 @@ +/* -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static int c_false = FALSE_; +static int c__2 = 2; +static float c_b26 = 1.f; +static float c_b30 = 0.f; +static int c_true = TRUE_; + +void RELAPACK_strsyl_rec2(char *trana, char *tranb, int *isgn, int + *m, int *n, float *a, int *lda, float *b, int *ldb, float * + c__, int *ldc, float *scale, int *info, ftnlen trana_len, + ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + float r__1, r__2; + + /* Local variables */ + static int j, k, l; + static float x[4] /* was [2][2] */; + static int k1, k2, l1, l2; + static float a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; + static int ierr; + static float smin; + extern float sdot_(int *, float *, int *, float *, int *); + static float suml, sumr; + extern int lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + static int knext, lnext; + static float xnorm; + extern /* Subroutine */ int slaln2_(int *, int *, int *, float + *, float *, float *, int *, float *, float *, float *, int *, + float *, float *, float *, int *, float *, float *, int *), + slasy2_(int *, int *, int *, int *, int *, + float *, int *, float *, int *, float *, int *, float *, + float *, int *, float *, int *), slabad_(float *, float *); + static float scaloc; + extern float slamch_(char *, ftnlen), slange_(char *, int *, + int *, float *, int *, float *, ftnlen); + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + static float bignum; + static int notrna, notrnb; + static float smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "T", (ftnlen)1, (ftnlen)1) && ! lsame_( + trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T", (ftnlen)1, (ftnlen)1) && ! 
+ lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("STRSYL", &i__1, (ftnlen)6); + return; + } + *scale = 1.f; + if (*m == 0 || *n == 0) { + return; + } + eps = slamch_("P", (ftnlen)1); + smlnum = slamch_("S", (ftnlen)1); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + smlnum = smlnum * (float) (*m * *n) / eps; + bignum = 1.f / smlnum; +/* Computing MAX */ + r__1 = smlnum, r__2 = eps * slange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * slange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = dmax(r__1,r__2); + sgn = (float) (*isgn); + if (notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L70; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.f) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L60; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.f) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L20: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = 
x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_false, &c_false, isgn, &c__2, &c__2, &a[k1 + + k1 * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, + &c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L50: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L60: + ; + } +L70: + ; + } + } else if (! 
notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L130; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.f) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = 1; + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + if (k < knext) { + goto L120; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.f) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L80: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L90: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 
* + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_true, &c_false, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L110: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L120: + ; + } +L130: + ; + } + } else if (! notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L190; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.f) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = 1; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + if (k < knext) { + goto L180; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.f) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l1; +/* Computing MIN */ + i__3 = l1 + 1; +/* Computing MIN */ + i__4 = l1 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L140: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * 
b[l1 + l1 * b_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L150: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L160: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_true, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 * + a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L170: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * 
c_dim1] = x[3]; + } +L180: + ; + } +L190: + ; + } + } else if (notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L250; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.f) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L240; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.f) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l1; +/* Computing MIN */ + i__2 = l1 + 1; +/* Computing MIN */ + i__3 = l1 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L200: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L210: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + 
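
Each /* Computing MIN */ marker here is f2c's expansion of a Fortran MIN intrinsic into the i__ temporaries that follow; the MIN merely clamps the start index so that a zero-length sdot_ call (reached when k1 == *m) is still handed an in-range address. In plain 0-based C, the suml term of the 1-by-1 case amounts to the following (a sketch only, real arithmetic):

    /* suml = A(k, k+1:m-1) . C(k+1:m-1, l): the dot product that folds
       the already-solved unknowns below block k into the right-hand side */
    static float dot_tail(int m, int k, int l, const float *a, int lda,
                          const float *c, int ldc) {
        float s = 0.f;
        int i;
        for (i = k + 1; i < m; i++)
            s += a[k + lda * i] * c[i + ldc * l];
        return s;
    }
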
i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L220: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_false, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L230: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L240: + ; + } +L250: + ; + } + } +} diff --git a/relapack/src/strtri.c b/relapack/src/strtri.c new file mode 100644 index 000000000..d35bbd49f --- /dev/null +++ b/relapack/src/strtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_strtri_rec(const char *, const char *, const int *, + float *, const int *, int *); + + +/** CTRTRI computes the inverse of a real upper or lower 
triangular matrix A. + * + * This routine is functionally equivalent to LAPACK's strtri. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html + * */ +void RELAPACK_strtri( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (!nounit && !unit) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("STRTRI", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleandiag = nounit ? 'N' : 'U'; + + // check for singularity + if (nounit) { + int i; + for (i = 0; i < *n; i++) + if (A[i + *ldA * i] == 0) { + *info = i; + return; + } + } + + // Recursive kernel + RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); +} + + +/** strtri's recursive compute kernel */ +static void RELAPACK_strtri_rec( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_STRTRI, 1)) { + // Unblocked + LAPACK(strti2)(uplo, diag, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = - A_BL * A_TL + BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); + // A_BL = A_BR \ A_BL + BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TR = - A_TL * A_TR + BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); + // A_TR = A_TR / A_BR + BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c new file mode 100644 index 000000000..3aa6bf531 --- /dev/null +++ b/relapack/src/zgbtrf.c @@ -0,0 +1,230 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, + const int *, double *, const int *, int *, double *, const int *, double *, + const int *, int *); + + +/** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's zgbtrf. 
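
A note on the band storage handled below: LAPACK stores a general band matrix so that entry A(i,j) sits in row kv + i - j of column j of Ab (0-based, with kv = kl + ku rows of headroom for the fill-in generated by pivoting). The routine's "unskew" trick, A = Ab + 2 * kv with ldA = ldAb - 1, makes the ordinary column-major reference A[i + ldA * j] land on exactly that slot. A sketch of the index map for real data (the complex code doubles all offsets for the interleaved real/imaginary parts):

    /* Fetch dense entry (i, j) from LAPACK band storage; valid when
       -ku <= i - j <= kl, with kv = kl + ku fill-in rows included. */
    static double get_band(const double *Ab, int ldAb, int kl, int ku,
                           int i, int j) {
        const int kv = kl + ku;                /* fill-in rows on top   */
        return Ab[(kv + i - j) + ldAb * j];    /* == A[i + (ldAb-1)*j]  */
    }
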
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html + * */ +void RELAPACK_zgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZGBTRF", &minfo); + return; + } + + // Constant + const double ZERO[] = { 0., 0. }; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + 2 * kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + double *const A_j = A + 2 * *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[2 * i] = A_j[2 * i + 1] = 0.; + } + + // Allocate work space + const int n1 = ZREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); + double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); + LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** zgbtrf's recursive compute kernel */ +static void RELAPACK_zgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { + // Unblocked + LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. 
}; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + 2 * kv; + + // Splitting + const int n1 = MIN(ZREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + double *const Ab_L = Ab; + double *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_L A_R + double *const A_L = A; + double *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * m1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + double *const A_Rl = A_R; + double *const A_Rr = A_R + 2 * *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + 2 * *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + 2 * m21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + 2 * *ldA * n21; + double *const A_BRbl = A_BR + 2 * m21; + double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + + // apply pivots to A_Rl + LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); + + // apply pivots to A_Rr columnwise + for (j = 0; j < n22; j++) { + double *const A_Rrj = A_Rr + 2 * *ldA * j; + for (i = j; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + const double tmpr = A_Rrj[2 * i]; + const double tmpc = A_Rrj[2 * i + 1]; + A_Rrj[2 * i] = A_Rrj[2 * ip]; + A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1]; + A_Rrj[2 * ip] = tmpr; + A_Rrj[2 * ip + 1] = tmpc; + } + } + } + + // A_TRl = A_TL \ A_TRl + BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // Worku = A_TRr + LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); + // Worku = A_TL \ Worku + BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + // A_TRr = Worku + LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); + // A_BRtl = A_BRtl - A_BLt * A_TRl + BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); + // A_BRbl = A_BRbl - Workl * A_TRl + BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); + // A_BRtr = A_BRtr - A_BLt * Worku + BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Workl * Worku + BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); + + // partially undo swaps in A_L + for (i = mn1 - 1; i >= 0; i--) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + 
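
Throughout these swap loops the LAPACK convention applies: ipiv entries are 1-based, hence the ip = ipiv_T[i] - 1 conversion, and ip < *kl distinguishes pivot rows still stored inside the kl-wide band panel from rows that were staged into the Workl buffer when they fell outside the band. Each branch then exchanges the length-i row prefixes with the appropriate partner; in plain real arithmetic, the underlying zswap does no more than this sketch:

    /* What each BLAS zswap above performs, shown for real data: exchange
       n elements of two rows stored with strides incx and incy. */
    static void swap_rows(int n, double *x, int incx, double *y, int incy) {
        int k;
        for (k = 0; k < n; k++) {
            double t = x[incx * k];
            x[incx * k] = y[incy * k];
            y[incy * k] = t;
        }
    }
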
if (ip < *kl) + BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + + // recursion(Ab_BR, ipiv_B) + RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/zgemmt.c b/relapack/src/zgemmt.c new file mode 100644 index 000000000..aa5930238 --- /dev/null +++ b/relapack/src/zgemmt.c @@ -0,0 +1,167 @@ +#include "relapack.h" + +static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + +static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + + +/** ZGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_zgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int ctransA = LAPACK(lsame)(transA, "C"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + const int ctransB = LAPACK(lsame)(transB, "C"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !ctransA && !notransA) + info = 2; + else if (!tranB && !ctransB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("ZGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); + const char cleantransB = notransB ? 'N' : (tranB ? 
'T' : 'C'); + + // Recursive kernel + RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** zgemmt's recursive compute kernel */ +static void RELAPACK_zgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { + // Unblocked + RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const double *const A_T = A; + const double *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const double *const B_L = B; + const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + double *const C_TL = C; + double *const C_TR = C + 2 * *ldC * n1; + double *const C_BL = C + 2 * n1; + double *const C_BR = C + 2 * *ldC * n1 + 2 * n1; + + // recursion(C_TL) + RELAPACK_zgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(zgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(zgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_zgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** zgemmt's unblocked compute kernel */ +static void RELAPACK_zgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; + + int i; + for (i = 0; i < *n; i++) { + // A_0 + // A_i + const double *const A_0 = A; + const double *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i); + + // * B_i * + const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i); + + // * C_0i * + // * C_ii * + double *const C_0i = C + 2 * *ldC * i; + double *const C_ii = C + 2 * *ldC * i + 2 * i; + + if (*uplo == 'L') { + const int nmi = *n - i; + if (*transA == 'N') + BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + else + BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + } else { + const int ip1 = i + 1; + if (*transA == 'N') + BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + else + BLAS(zgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + } + } +} diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c new file mode 100644 index 000000000..cf8921e1f --- /dev/null +++ b/relapack/src/zgetrf.c @@ -0,0 +1,117 @@ +#include "relapack.h" + +static void RELAPACK_zgetrf_rec(const int *, const int *, double *, + const int *, int *, int *); + + +/** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's zgetrf. 
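
The recursive pattern shared by the ReLAPACK *getrf routines replaces LAPACK's fixed block size with a halving split: factor the left block column, solve a unit lower triangular system for the top-right block, update the trailing Schur complement with one large matrix product, and recurse on it. Stripped of pivoting and complex arithmetic, the shape is as follows (an unpivoted teaching sketch only, not the routine itself):

    /* Unpivoted recursive LU sketch: column-major A, leading dimension ldA. */
    static void lu_rec(int n, double *A, int ldA) {
        int i, j, k;
        if (n <= 1)
            return;                                   /* 1x1 block: done   */
        const int n1 = n / 2, n2 = n - n1;
        double *ATL = A,      *ATR = A + ldA * n1;
        double *ABL = A + n1, *ABR = A + ldA * n1 + n1;
        lu_rec(n1, ATL, ldA);                         /* ATL = L1 * U1     */
        for (j = 0; j < n2; j++)                      /* ATR = L1 \ ATR    */
            for (k = 0; k < n1; k++)
                for (i = k + 1; i < n1; i++)
                    ATR[i + ldA * j] -= ATL[i + ldA * k] * ATR[k + ldA * j];
        for (k = 0; k < n1; k++)                      /* ABL = ABL / U1    */
            for (i = 0; i < n2; i++) {
                ABL[i + ldA * k] /= ATL[k + ldA * k];
                for (j = k + 1; j < n1; j++)
                    ABL[i + ldA * j] -= ABL[i + ldA * k] * ATL[k + ldA * j];
            }
        for (j = 0; j < n2; j++)                      /* ABR -= ABL * ATR  */
            for (k = 0; k < n1; k++)
                for (i = 0; i < n2; i++)
                    ABR[i + ldA * j] -= ABL[i + ldA * k] * ATR[k + ldA * j];
        lu_rec(n2, ABR, ldA);                         /* Schur complement  */
    }
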
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html
+ * */
+void RELAPACK_zgetrf(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const double ONE[] = { 1., 0. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const double *const A_L = A;
+        double *const A_R = A + 2 * *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** zgetrf's recursive compute kernel */
+static void RELAPACK_zgetrf_rec(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_ZGETRF, 1)) {
+        // Unblocked
+        LAPACK(zgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + 2 * *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR
+    BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR
+    BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/zhegst.c b/relapack/src/zhegst.c
new file mode 100644
index 000000000..d0ece2148
--- /dev/null
+++ b/relapack/src/zhegst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_zhegst_rec(const int *, const char *, const int *,
+    double *, const int *, const double *, const int *,
+    double *, const int *, int *);
+
+
+/** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's zhegst.
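
In exact arithmetic, zhegst overwrites A for itype == 1 with inv(L) * A * inv(L)' (uplo = 'L') or inv(U') * A * inv(U) (uplo = 'U'), and for itype == 2 or 3 with L' * A * L or U * A * U', where B already holds the Cholesky factor produced by zpotrf and ' denotes the conjugate transpose. Partitioning L = [L11 0; L21 L22] conformally with the blocks used below, the lower itype == 1 recursion works out to

    C11  = inv(L11) * A11 * inv(L11)'              (recursion on A_TL, B_TL)
    W    = A21 * inv(L11)' - 1/2 * L21 * C11       (ztrsm, first zhemm)
    A22 <- A22 - W * L21' - L21 * W'               (zher2k)
    C21  = inv(L22) * (W - 1/2 * L21 * C11)        (second zhemm, ztrsm)
    C22  = inv(L22) * A22 * inv(L22)'              (recursion on A_BR, B_BR)

Applying half of the L21 * C11 correction before the rank-2k update and the other half after it is what lets a single zher2k produce the complete two-sided transformation of the trailing block.
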
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html + * */ +void RELAPACK_zhegst( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHEGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Allocate work space + double *Work = NULL; + int lWork = 0; +#if XSYGST_ALLOW_MALLOC + const int n1 = ZREC_SPLIT(*n); + lWork = n1 * (*n - n1); + Work = malloc(lWork * 2 * sizeof(double)); + if (!Work) + lWork = 0; +#endif + + // recursive kernel + RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); + + // Free work space +#if XSYGST_ALLOW_MALLOC + if (Work) + free(Work); +#endif +} + + +/** zhegst's recursive compute kernel */ +static void RELAPACK_zhegst_rec( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + double *Work, const int *lWork, int *info +) { + + if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { + // Unblocked + LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info); + return; + } + + // Constants + const double ZERO[] = { 0., 0. }; + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const double HALF[] = { .5, 0. }; + const double MHALF[] = { -.5, 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // B_TL B_TR + // B_BL B_BR + const double *const B_TL = B; + const double *const B_TR = B + 2 * *ldB * n1; + const double *const B_BL = B + 2 * n1; + const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // recursion(A_TL, B_TL) + RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); + + if (*itype == 1) + if (*uplo == 'L') { + // A_BL = A_BL / B_TL' + BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); + if (*lWork >= n2 * n1) { + // T = -1/2 * B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); + } else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' + BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); + if (*lWork >= n2 * n1) + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); + else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BL = B_BR \ A_BL + BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL' \ A_TR + BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork >= n2 * n1) 
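
Both branches of this workspace test apply the same correction T twice, once before the zher2k and once after it. When the caller supplied at least n1 * n2 complex elements of workspace, T is formed a single time by zhemm into Work and added twice with zaxpy; otherwise the zhemm is simply executed twice with beta = 1, trading one extra Level-3 product for not needing the buffer.
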
{
+                // T = -1/2 * A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork >= n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork >= n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork >= n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR * A_BL
+            BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork >= n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork >= n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR
+            BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/zhetrf.c b/relapack/src/zhetrf.c
new file mode 100644
index 000000000..ef4e1f5d5
--- /dev/null
+++ b/relapack/src/zhetrf.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zhetrf.
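
RELAPACK_zhetrf follows LAPACK's workspace-query convention: a call with *lWork == -1 only deposits the required number of complex workspace elements in Work[0]. A hypothetical calling sequence (the wrapper name is invented, error handling is elided, and relapack.h is assumed to declare the routine):

    #include <stdlib.h>
    #include "relapack.h"

    /* Factor a Hermitian n x n matrix held as interleaved complex in A. */
    static void factor_hermitian(char uplo, int n, double *A, int ldA,
                                 int *ipiv, int *info) {
        double query;
        int lWork = -1;
        RELAPACK_zhetrf(&uplo, &n, A, &ldA, ipiv, &query, &lWork, info);
        lWork = (int)query;                          /* complex elements  */
        double *Work = malloc((size_t)lWork * 2 * sizeof(double));
        RELAPACK_zhetrf(&uplo, &n, A, &ldA, ipiv, Work, &lWork, info);
        free(Work);
    }
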
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html + * */ +void RELAPACK_zhetrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zhetrf's recursive compute kernel */ +static void RELAPACK_zhetrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zhetrf_rec2.c b/relapack/src/zhetrf_rec2.c new file mode 100644 index 000000000..867ea64e1 --- /dev/null +++ b/relapack/src/zhetrf_rec2.c @@ -0,0 +1,524 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static doublecomplex c_b1 = {1.,0.}; +static int c__1 = 1; + +/** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method + * + * This routine is a minor modification of LAPACK's zlahef. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int * + nb, int *kb, doublecomplex *a, int *lda, int *ipiv, + doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + double d__1, d__2, d__3, d__4; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + double sqrt(double), d_imag(doublecomplex *); + void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *, + doublecomplex *, doublecomplex *); + + /* Local variables */ + static int j, k; + static double t, r1; + static doublecomplex d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static double alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + static int kstep; + extern /* Subroutine */ int zgemv_(char *, int *, int *, + doublecomplex *, doublecomplex *, int *, doublecomplex *, + int *, doublecomplex *, doublecomplex *, int *, ftnlen), + zcopy_(int *, doublecomplex *, int *, doublecomplex *, + int *), zswap_(int *, doublecomplex *, int *, + doublecomplex *, int *); + static double absakk; + extern /* Subroutine */ int zdscal_(int *, double *, + doublecomplex *, int *); + static double colmax; + extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) + ; + extern int izamax_(int *, doublecomplex *, int *); + static double rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.) + 1.) / 8.; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + i__1 = k - 1; + zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k + kw * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + i__1 = k + kw * w_dim1; + i__2 = k + kw * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k + kw * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = imax + kw * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + kw * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
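
This zero test is the singular-column exit of the Bunch-Kaufman pivot search: absakk is the magnitude of the (real) diagonal entry of column k and colmax the largest off-diagonal magnitude in that column, so both being zero means the whole column vanishes; the factorization records the defect (the first such k is kept in *info) and continues with a trivial 1-by-1 pivot. The threshold alpha = (sqrt(17) + 1) / 8, roughly 0.6404, set at the top of the routine is the classical Bunch-Kaufman constant, chosen to balance the worst-case element growth of 1-by-1 and 2-by-2 pivot steps.
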
{ + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - 1; + zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (kw - 1) * w_dim1]), abs(d__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + r1 = 1. / a[i__1].r; + i__1 = k - 1; + zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + kw * w_dim1], &z__2); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.); + z__2.r = t, z__2.i = 0.; + z_div(&z__1, &z__2, &d21); + d21.r = z__1.r, d21.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + d_cnjg(&z__2, &d21); + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = + z__2.r * z__3.i + z__2.i * z__3.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
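+	/* lower-storage counterpart of the singularity check above: a zero
+	   column again only marks *info and does not stop the sweep */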
{ + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (imax < *n) { + i__1 = *n - imax; + zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (k + 1) * w_dim1]), abs(d__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + r1 = 1. / a[i__1].r; + i__1 = *n - k; + zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + k * w_dim1], &z__2); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.);
+		    z__2.r = t, z__2.i = 0.;
+		    z_div(&z__1, &z__2, &d21);
+		    d21.r = z__1.r, d21.i = z__1.i;
+		    i__1 = *n;
+		    for (j = k + 2; j <= i__1; ++j) {
+			i__2 = j + k * a_dim1;
+			d_cnjg(&z__2, &d21);
+			i__3 = j + k * w_dim1;
+			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+				.r;
+			i__4 = j + (k + 1) * w_dim1;
+			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+				.i;
+			z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
+				z__2.r * z__3.i + z__2.i * z__3.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+			i__2 = j + (k + 1) * a_dim1;
+			i__3 = j + (k + 1) * w_dim1;
+			z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+				z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+				.r;
+			i__4 = j + k * w_dim1;
+			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+				.i;
+			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+				d21.r * z__2.i + d21.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+		    }
+		}
+		i__1 = k + k * a_dim1;
+		i__2 = k + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + k * a_dim1;
+		i__2 = k + 1 + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + (k + 1) * a_dim1;
+		i__2 = k + 1 + (k + 1) * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = *n - k;
+		zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+		i__1 = *n - k - 1;
+		zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+	j = k - 1;
+L120:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    --j;
+	}
+	--j;
+	if (jp != jj && j >= 1) {
+	    zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j > 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c
new file mode 100644
index 000000000..15ceaeae7
--- /dev/null
+++ b/relapack/src/zhetrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zhetrf_rook.
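+ *
+ * A minimal calling sketch (not from the original sources; it merely
+ * restates the argument handling coded below, where lWork == -1 is a
+ * workspace query that stores the required size, n*(n/2) complex
+ * elements, in Work[0]):
+ *
+ *   int n = 100, ldA = 100, ipiv[100], info, lWork = -1;
+ *   double *A = ...;           // n x n Hermitian, interleaved re/im parts
+ *   double query[2];
+ *   RELAPACK_zhetrf_rook("L", &n, A, &ldA, ipiv, query, &lWork, &info);
+ *   lWork = (int) query[0];    // = n * (n / 2)
+ *   double *Work = malloc(2 * lWork * sizeof(double));
+ *   RELAPACK_zhetrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *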
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html + * */ +void RELAPACK_zhetrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zhetrf_rook's recursive compute kernel */ +static void RELAPACK_zhetrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
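+	    /* top recursion level: the whole buffer doubles as Work_BR,
+	       so its leading dimension is just n2, not the caller's ldWork */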
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zhetrf_rook_rec2.c b/relapack/src/zhetrf_rook_rec2.c new file mode 100644 index 000000000..a56ad710b --- /dev/null +++ b/relapack/src/zhetrf_rook_rec2.c @@ -0,0 +1,662 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
+ *
+ * This routine is a minor modification of LAPACK's zlahef_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n,
+	int *nb, int *kb, doublecomplex *a, int *lda, int *
+	ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2;
+    doublecomplex z__1, z__2, z__3, z__4, z__5;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
+	    doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static double t, r1;
+    static doublecomplex d11, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    static double dtemp, sfmin;
+    static int itemp, kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
+	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+	    int *), zswap_(int *, doublecomplex *, int *,
+	    doublecomplex *, int *);
+    extern double dlamch_(char *, ftnlen);
+    static double absakk;
+    extern /* Subroutine */ int zdscal_(int *, double *,
+	    doublecomplex *, int *);
+    static double colmax;
+    extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
+	    ;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.) + 1.)
/ 8.; + sfmin = dlamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + if (k > 1) { + i__1 = k - 1; + zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & + c__1); + } + i__1 = k + kw * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + i__1 = k + kw * w_dim1; + i__2 = k + kw * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k + kw * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = imax + kw * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + kw * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + d__1 = w[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + if (k > 1) { + i__1 = k - 1; + zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], + &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + if (imax > 1) { + i__1 = imax - 1; + zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + } + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (kw - 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (kw - 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! 
done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = k - 1 - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + i__1 = k - 1 - p; + zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); + if (p > 1) { + i__1 = p - 1; + zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (abs(t) >= sfmin) { + r1 = 1. / t; + i__1 = k - 1; + zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L14: */ + } + } + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + kw * w_dim1], &z__2); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d21); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + d_cnjg(&z__5, &d21); + z_div(&z__2, &z__3, &z__5); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + ++jj; + if (kstep == 2 && jp1 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * + a_dim1], &c__1); + } + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (imax < *n) { + i__1 = *n - imax; + zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (k + 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (k + 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = p - k - 1; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * + a_dim1], lda); + i__1 = p - k - 1; + zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); + if (p < *n) { + i__1 = *n - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p + * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + } + zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (abs(t) >= sfmin) { + r1 = 1. 
/ t; + i__1 = *n - k; + zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L74: */ + } + } + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + k * w_dim1], &z__2); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. / (z__1.r - 1.); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + d_cnjg(&z__5, &d21); + z_div(&z__2, &z__3, &z__5); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d21); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = *n - k - 1; + zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + --jj; + if (kstep == 2 && jp1 != jj && j >= 1) { + zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/zlauum.c b/relapack/src/zlauum.c new file mode 100644 index 000000000..490dcc82e --- /dev/null +++ b/relapack/src/zlauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_zlauum_rec(const char *, const int *, double *, + const int *, int *); + + +/** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's zlauum. 
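+ *
+ * The recursive kernel below uses the 2x2 block form of the product:
+ * with L = [L_TL 0; L_BL L_BR], the top-left block of L**H * L is
+ * L_TL**H * L_TL + L_BL**H * L_BL (recursion plus a ZHERK update) and
+ * the bottom-left block is L_BR**H * L_BL (a ZTRMM); the U case mirrors
+ * this with A_TR and transposed side flags.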
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html + * */ +void RELAPACK_zlauum( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_zlauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** zlauum's recursive compute kernel */ +static void RELAPACK_zlauum_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { + // Unblocked + LAPACK(zlauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c new file mode 100644 index 000000000..37e711c9d --- /dev/null +++ b/relapack/src/zpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, + double *, const int *, double *, const int *, int *); + + +/** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's zpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html + * */ +void RELAPACK_zpbtrf( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const double ZERO[] = { 0., 0. }; + + // Allocate work space + const int n1 = ZREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; + double *Work = malloc(mWork * nWork * 2 * sizeof(double)); + LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** zpbtrf's recursive compute kernel */ +static void RELAPACK_zpbtrf_rec( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + double *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { + // Unblocked + LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); + + // Splitting + const int n1 = MIN(ZREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + double *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + 2 * *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + 2 * n21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + 2 * *ldA * n21; + double *const A_BRbl = A_BR + 2 * n21; + double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/zpotrf.c b/relapack/src/zpotrf.c new file mode 100644 index 000000000..411ac5fc0 --- /dev/null +++ 
b/relapack/src/zpotrf.c
@@ -0,0 +1,92 @@
+#include "relapack.h"
+
+static void RELAPACK_zpotrf_rec(const char *, const int *, double *,
+    const int *, int *);
+
+
+/** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's zpotrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html
+ * */
+void RELAPACK_zpotrf(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZPOTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Recursive kernel
+    RELAPACK_zpotrf_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** zpotrf's recursive compute kernel */
+static void RELAPACK_zpotrf_rec(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) {
+        // Unblocked
+        LAPACK(zpotf2)(uplo, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // recursion(A_TL)
+    RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = A_BL / A_TL'
+        BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
+        // A_BR = A_BR - A_BL * A_BL'
+        BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
+    } else {
+        // A_TR = A_TL' \ A_TR
+        BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+        // A_BR = A_BR - A_TR' * A_TR
+        BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c
new file mode 100644
index 000000000..3be21563a
--- /dev/null
+++ b/relapack/src/zsytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zsytrf.
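+ *
+ * When compiled with XSYTRF_ALLOW_MALLOC, an undersized lWork (other
+ * than the -1 size query) is tolerated: the code below falls back to a
+ * heap buffer of n*(n/2) complex elements and frees it before returning;
+ * without that flag the same condition is reported as argument error -7
+ * via XERBLA.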
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html + * */ +void RELAPACK_zsytrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zsytrf's recursive compute kernel */ +static void RELAPACK_zsytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zsytrf_rec2.c b/relapack/src/zsytrf_rec2.c new file mode 100644 index 000000000..33902ee9e --- /dev/null +++ b/relapack/src/zsytrf_rec2.c @@ -0,0 +1,452 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's zlasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int *
+	nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
+	doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2, d__3, d__4;
+    doublecomplex z__1, z__2, z__3;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k;
+    static doublecomplex t, r1, d11, d21, d22;
+    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
+	    doublecomplex *, int *);
+    static int kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
+	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+	    int *), zswap_(int *, doublecomplex *, int *,
+	    doublecomplex *, int *);
+    static double absakk, colmax;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.) + 1.) / 8.;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    z__1.r = -1., z__1.i = -0.;
+	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+		    lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	kstep = 1;
+	i__1 = k + kw * w_dim1;
+	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
+		w_dim1]), abs(d__2));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+		    kw * w_dim1]), abs(d__2));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.)
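+	/* zero column: as in the Hermitian kernels, only *info is set. Since
+	   this kernel is complex symmetric rather than Hermitian, diagonal
+	   entries stay complex; 1x1 pivots below therefore scale with the
+	   complex reciprocal (z_div + zscal_) instead of the real zdscal_. */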
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (kw - 1) * w_dim1]), abs(d__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha * + rowmax) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = k - 1; + zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + kw * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * + d22.i + d11.i * d22.r; + z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; + z_div(&z__1, &c_b1, &z__2); + t.r = z__1.r, t.i = z__1.i; + z_div(&z__1, &t, &d21); + d21.r = z__1.r, d21.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * 
z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * + w_dim1]), abs(d__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (k + 1) * w_dim1]), abs(d__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha * + rowmax) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + 
a_dim1], lda, &a[kp + a_dim1], lda);
+	    }
+	    zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+	}
+	if (kstep == 1) {
+	    i__1 = *n - k + 1;
+	    zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+		    c__1);
+	    if (k < *n) {
+		z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
+		r1.r = z__1.r, r1.i = z__1.i;
+		i__1 = *n - k;
+		zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+	    }
+	} else {
+	    if (k < *n - 1) {
+		i__1 = k + 1 + k * w_dim1;
+		d21.r = w[i__1].r, d21.i = w[i__1].i;
+		z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		d11.r = z__1.r, d11.i = z__1.i;
+		z_div(&z__1, &w[k + k * w_dim1], &d21);
+		d22.r = z__1.r, d22.i = z__1.i;
+		z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
+			d22.i + d11.i * d22.r;
+		z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
+		z_div(&z__1, &c_b1, &z__2);
+		t.r = z__1.r, t.i = z__1.i;
+		z_div(&z__1, &t, &d21);
+		d21.r = z__1.r, d21.i = z__1.i;
+		i__1 = *n;
+		for (j = k + 2; j <= i__1; ++j) {
+		    i__2 = j + k * a_dim1;
+		    i__3 = j + k * w_dim1;
+		    z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+			    z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
+			    .r;
+		    i__4 = j + (k + 1) * w_dim1;
+		    z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+			    .i;
+		    z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+			    d21.r * z__2.i + d21.i * z__2.r;
+		    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+		    i__2 = j + (k + 1) * a_dim1;
+		    i__3 = j + (k + 1) * w_dim1;
+		    z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+			    z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+			    .r;
+		    i__4 = j + k * w_dim1;
+		    z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+			    .i;
+		    z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+			    d21.r * z__2.i + d21.i * z__2.r;
+		    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+		}
+	    }
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + k * a_dim1;
+	    i__2 = k + 1 + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + (k + 1) * a_dim1;
+	    i__2 = k + 1 + (k + 1) * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	}
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+	j = k - 1;
+L120:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    --j;
+	}
+	--j;
+	if (jp != jj && j >= 1) {
+	    zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j > 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c
new file mode 100644
index 000000000..c598f7b1e
--- /dev/null
+++ b/relapack/src/zsytrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *,
+    double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zsytrf_rook.
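+ *
+ * As in LAPACK, passing lWork = -1 performs a workspace-size query that
+ * stores the required size (in complex elements) in Work[0]. The call
+ * sequence below is only an illustrative sketch added for this write-up
+ * (the n = 100 dimension and the buffer names are assumptions; A is
+ * assumed to hold a complex symmetric matrix in LAPACK column-major
+ * layout):
+ *
+ *   int n = 100, ldA = n, ipiv[100], info, query = -1;
+ *   double qwork[2];
+ *   RELAPACK_zsytrf_rook("L", &n, A, &ldA, ipiv, qwork, &query, &info);
+ *   int lWork = (int) qwork[0];                      // complex elements
+ *   double *Work = malloc(2 * lWork * sizeof(double));
+ *   RELAPACK_zsytrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *   free(Work);
+ *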
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html + * */ +void RELAPACK_zsytrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zsytrf_rook's recursive compute kernel */ +static void RELAPACK_zsytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zsytrf_rook_rec2.c b/relapack/src/zsytrf_rook_rec2.c new file mode 100644 index 000000000..9e111fe0c --- /dev/null +++ b/relapack/src/zsytrf_rook_rec2.c @@ -0,0 +1,561 @@ +/* -- translated by f2c (version 20100827). 
You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's zlasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n,
+	int *nb, int *kb, doublecomplex *a, int *lda, int *
+	ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static doublecomplex t, r1, d11, d12, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    static double dtemp, sfmin;
+    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
+	    doublecomplex *, int *);
+    static int itemp, kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
+	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+	    int *), zswap_(int *, doublecomplex *, int *,
+	    doublecomplex *, int *);
+    extern double dlamch_(char *, ftnlen);
+    static double absakk, colmax;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.) + 1.) / 8.;
+    sfmin = dlamch_("S", (ftnlen)1);
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	kstep = 1;
+	p = k;
+	zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    z__1.r = -1., z__1.i = -0.;
+	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	i__1 = k + kw * w_dim1;
+	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
+		w_dim1]), abs(d__2));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+		    kw * w_dim1]), abs(d__2));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	    zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	} else {
+	    if (!
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (kw - 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (kw - 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = k - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + + k * a_dim1]), abs(d__2)) >= sfmin) { + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = k - 1; + zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0. || a[i__1].i != 0.) 
{ + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + z_div(&z__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L14: */ + } + } + } + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d12.r = w[i__1].r, d12.i = w[i__1].i; + z_div(&z__1, &w[k + kw * w_dim1], &d12); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); + d22.r = z__1.r, d22.i = z__1.i; + z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * + d22.i + d11.i * d22.r; + z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; + z_div(&z__1, &c_b1, &z__2); + t.r = z__1.r, t.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d12); + z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * + z__2.i + t.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d12); + z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * + z__2.i + t.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * + w_dim1]), abs(d__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (k + 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (k + 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - k - 1; + zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + + k * a_dim1]), abs(d__2)) >= sfmin) { + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = *n - k; + zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0. || a[i__1].i != 0.) 
{
+			i__1 = *n;
+			for (ii = k + 1; ii <= i__1; ++ii) {
+			    i__2 = ii + k * a_dim1;
+			    z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
+				    a_dim1]);
+			    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L74: */
+			}
+		    }
+		}
+	    }
+	} else {
+	    if (k < *n - 1) {
+		i__1 = k + 1 + k * w_dim1;
+		d21.r = w[i__1].r, d21.i = w[i__1].i;
+		z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		d11.r = z__1.r, d11.i = z__1.i;
+		z_div(&z__1, &w[k + k * w_dim1], &d21);
+		d22.r = z__1.r, d22.i = z__1.i;
+		z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
+			d22.i + d11.i * d22.r;
+		z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
+		z_div(&z__1, &c_b1, &z__2);
+		t.r = z__1.r, t.i = z__1.i;
+		i__1 = *n;
+		for (j = k + 2; j <= i__1; ++j) {
+		    i__2 = j + k * a_dim1;
+		    i__3 = j + k * w_dim1;
+		    z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+			    z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+			    .r;
+		    i__4 = j + (k + 1) * w_dim1;
+		    z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+			    .i;
+		    z_div(&z__2, &z__3, &d21);
+		    z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
+			    z__2.i + t.i * z__2.r;
+		    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+		    i__2 = j + (k + 1) * a_dim1;
+		    i__3 = j + (k + 1) * w_dim1;
+		    z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+			    z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+			    .r;
+		    i__4 = j + k * w_dim1;
+		    z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+			    .i;
+		    z_div(&z__2, &z__3, &d21);
+		    z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
+			    z__2.i + t.i * z__2.r;
+		    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+		}
+	    }
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + k * a_dim1;
+	    i__2 = k + 1 + k * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    i__1 = k + 1 + (k + 1) * a_dim1;
+	    i__2 = k + 1 + (k + 1) * w_dim1;
+	    a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	}
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -p;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+	j = k - 1;
+L120:
+	kstep = 1;
+	jp1 = 1;
+	jj = j;
+	jp2 = ipiv[j];
+	if (jp2 < 0) {
+	    jp2 = -jp2;
+	    --j;
+	    jp1 = -ipiv[j];
+	    kstep = 2;
+	}
+	--j;
+	if (jp2 != jj && j >= 1) {
+	    zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	jj = j + 1;
+	if (jp1 != jj && kstep == 2) {
+	    zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j >= 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/ztgsyl.c b/relapack/src/ztgsyl.c
new file mode 100644
index 000000000..2c8a35256
--- /dev/null
+++ b/relapack/src/ztgsyl.c
@@ -0,0 +1,268 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *,
+    const int *, const double *, const int *, const double *, const int *,
+    double *, const int *, const double *, const int *, const double *,
+    const int *, double *, const int *, double *, double *, double *, int *);
+
+
+/** ZTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's ztgsyl.
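+ *
+ * As documented for LAPACK's ztgsyl, with trans = 'N' it solves the
+ * generalized Sylvester equation
+ *
+ *   A * R - L * B = scale * C
+ *   D * R - L * E = scale * F,
+ *
+ * where R and L are unknown m-by-n matrices that overwrite C and F, and
+ * the pairs (A, D) and (B, E) are given in generalized Schur form.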
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html
+ * */
+void RELAPACK_ztgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dif,
+    double *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "C");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZTGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'C';
+
+    // Constant
+    const double ZERO[] = { 0., 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    double scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        double dscale = 0;
+        double dsum = 1;
+        RELAPACK_ztgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(zlacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
+                LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(zlacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** ztgsyl's recursive compute kernel */
+static void RELAPACK_ztgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dsum, double *dscale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) {
+        // Unblocked
+        LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1., 0. };
+    double scale2[] = { 1., 0.
}; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + const int m1 = ZREC_SPLIT(*m); + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const double *const A_TL = A; + const double *const A_TR = A + 2 * *ldA * m1; + const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1; + + // C_T + // C_B + double *const C_T = C; + double *const C_B = C + 2 * m1; + + // D_TL D_TR + // 0 D_BR + const double *const D_TL = D; + const double *const D_TR = D + 2 * *ldD * m1; + const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1; + + // F_T + // F_B + double *const F_T = F; + double *const F_B = F + 2 * m1; + + if (*trans == 'N') { + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); + // C_T = C_T - A_TR * C_B + BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // F_T = F_T - D_TR * C_B + BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); + } + } else { + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); + // C_B = C_B - A_TR^H * C_T + BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // C_B = C_B - D_TR^H * F_T + BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); + } + } + } else { + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const double *const B_TL = B; + const double *const B_TR = B + 2 * *ldB * n1; + const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // C_L C_R + double *const C_L = C; + double *const C_R = C + 2 * *ldC * n1; + + // E_TL E_TR + // 0 E_BR + const double *const E_TL = E; + const double *const E_TR = E + 2 * *ldE * n1; + const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1; + + // F_L F_R + double *const F_L = F; + double *const F_R = F + 2 * *ldF * n1; + + if (*trans == 'N') { + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); + // C_R = C_R + F_L * B_TR + BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); + // F_R = F_R + F_L * E_TR + BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, 
ONE, scale2, m, &n1, C_L, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); + } + } else { + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); + // F_L = F_L + C_R * B_TR + BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); + // F_L = F_L + F_R * E_TR + BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); + } + } + } + + *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c new file mode 100644 index 000000000..82b2c8803 --- /dev/null +++ b/relapack/src/ztrsyl.c @@ -0,0 +1,163 @@ +#include "relapack.h" + +static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, int *); + + +/** ZTRSYL solves the complex Sylvester matrix equation. + * + * This routine is functionally equivalent to LAPACK's ztrsyl. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html + * */ +void RELAPACK_ztrsyl( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + + // Check arguments + const int notransA = LAPACK(lsame)(tranA, "N"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int ctransB = LAPACK(lsame)(tranB, "C"); + *info = 0; + if (!ctransA && !notransA) + *info = -1; + else if (!ctransB && !notransB) + *info = -2; + else if (*isgn != 1 && *isgn != -1) + *info = -3; + else if (*m < 0) + *info = -4; + else if (*n < 0) + *info = -5; + else if (*ldA < MAX(1, *m)) + *info = -7; + else if (*ldB < MAX(1, *n)) + *info = -9; + else if (*ldC < MAX(1, *m)) + *info = -11; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZTRSYL", &minfo); + return; + } + + // Clean char * arguments + const char cleantranA = notransA ? 'N' : 'C'; + const char cleantranB = notransB ? 'N' : 'C'; + + // Recursive kernel + RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} + + +/** ztrsyl's recursive compute kernel */ +static void RELAPACK_ztrsyl_rec( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + + if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { + // Unblocked + RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const double MSGN[] = { -*isgn, 0. 
};
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1., 0. };
+    double scale2[] = { 1., 0. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        const int m1 = ZREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + 2 * *ldA * m1;
+        const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + 2 * m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        const int n1 = ZREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + 2 * *ldB * n1;
+        const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + 2 * *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c
new file mode 100644
index 000000000..526ab097c
--- /dev/null
+++ b/relapack/src/ztrsyl_rec2.c
@@ -0,0 +1,394 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "../config.h" +#include "f2c.h" + +#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { + extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); + doublecomplex result; + zdotu_(&result, n, x, incx, y, incy); + return result; +} +#define zdotu_ zdotu_fun + +doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { + extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); + doublecomplex result; + zdotc_(&result, n, x, incx, y, incy); + return result; +} +#define zdotc_ zdotc_fun +#endif + +#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { + extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *); + doublecomplex result; + zladiv_(&result, a, b); + return result; +} +#define zladiv_ zladiv_fun +#endif + +/* Table of constant values */ + +static int c__1 = 1; + +/** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) + * + * This routine is an exact copy of LAPACK's ztrsyl. + * It serves as an unblocked kernel in the recursive algorithms. + * */ +/* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int + *isgn, int *m, int *n, doublecomplex *a, int *lda, + doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, + double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + double d__1, d__2; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + double d_imag(doublecomplex *); + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + static int j, k, l; + static doublecomplex a11; + static double db; + static doublecomplex x11; + static double da11; + static doublecomplex vec; + static double dum[1], eps, sgn, smin; + static doublecomplex suml, sumr; + extern int lsame_(char *, char *, ftnlen, ftnlen); + /* Double Complex */ doublecomplex zdotc_(int *, + doublecomplex *, int *, doublecomplex *, int *), zdotu_( + int *, doublecomplex *, int *, + doublecomplex *, int *); + extern /* Subroutine */ int dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen); + static double scaloc; + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + extern double zlange_(char *, int *, int *, doublecomplex *, + int *, double *, ftnlen); + static double bignum; + extern /* Subroutine */ int zdscal_(int *, double *, + doublecomplex *, int *); + /* Double Complex */ doublecomplex zladiv_(doublecomplex *, + doublecomplex *); + static int notrna, notrnb; + static double smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, 
(ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZTRSY2", &i__1, (ftnlen)6); + return; + } + *scale = 1.; + if (*m == 0 || *n == 0) { + return; + } + eps = dlamch_("P", (ftnlen)1); + smlnum = dlamch_("S", (ftnlen)1); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + smlnum = smlnum * (double) (*m * *n) / eps; + bignum = 1. / smlnum; +/* Computing MAX */ + d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = max(d__1,d__2); + sgn = (double) (*isgn); + if (notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + for (k = *m; k >= 1; --k) { + i__2 = *m - k; +/* Computing MIN */ + i__3 = k + 1; +/* Computing MIN */ + i__4 = k + 1; + z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ + min(i__4,*m) + l * c_dim1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__2 = l - 1; + z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = z__1.r, sumr.i = z__1.i; + i__2 = k + l * c_dim1; + z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i; + z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i; + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L20: */ + } +/* L30: */ + } + } else if (! 
notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + i__3 = k - 1; + z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__3 = l - 1; + z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = z__1.r, sumr.i = z__1.i; + i__3 = k + l * c_dim1; + z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + d_cnjg(&z__2, &a[k + k * a_dim1]); + i__3 = l + l * b_dim1; + z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; + z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + i__3 = k + l * c_dim1; + c__[i__3].r = x11.r, c__[i__3].i = x11.i; +/* L50: */ + } +/* L60: */ + } + } else if (! notrna && ! notrnb) { + for (l = *n; l >= 1; --l) { + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = k - 1; + z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__2 = *n - l; +/* Computing MIN */ + i__3 = l + 1; +/* Computing MIN */ + i__4 = l + 1; + z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ + l + min(i__4,*n) * b_dim1], ldb); + sumr.r = z__1.r, sumr.i = z__1.i; + i__2 = k + l * c_dim1; + d_cnjg(&z__4, &sumr); + z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; + z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i; + d_cnjg(&z__1, &z__2); + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L80: */ + } +/* L90: */ + } + } else if (notrna && ! 
notrnb) {
+	for (l = *n; l >= 1; --l) {
+	    for (k = *m; k >= 1; --k) {
+		i__1 = *m - k;
+/* Computing MIN */
+		i__2 = k + 1;
+/* Computing MIN */
+		i__3 = k + 1;
+		z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
+			min(i__3,*m) + l * c_dim1], &c__1);
+		suml.r = z__1.r, suml.i = z__1.i;
+		i__1 = *n - l;
+/* Computing MIN */
+		i__2 = l + 1;
+/* Computing MIN */
+		i__3 = l + 1;
+		z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
+			l + min(i__3,*n) * b_dim1], ldb);
+		sumr.r = z__1.r, sumr.i = z__1.i;
+		i__1 = k + l * c_dim1;
+		d_cnjg(&z__4, &sumr);
+		z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
+		z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
+		z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i;
+		vec.r = z__1.r, vec.i = z__1.i;
+		scaloc = 1.;
+		i__1 = k + k * a_dim1;
+		d_cnjg(&z__3, &b[l + l * b_dim1]);
+		z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i;
+		z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i;
+		a11.r = z__1.r, a11.i = z__1.i;
+		da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
+			d__2));
+		if (da11 <= smin) {
+		    a11.r = smin, a11.i = 0.;
+		    da11 = smin;
+		    *info = 1;
+		}
+		db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
+			d__2));
+		if (da11 < 1. && db > 1.) {
+		    if (db > bignum * da11) {
+			scaloc = 1. / db;
+		    }
+		}
+		z__3.r = scaloc, z__3.i = 0.;
+		z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
+			z__3.i + vec.i * z__3.r;
+		z__1 = zladiv_(&z__2, &a11);
+		x11.r = z__1.r, x11.i = z__1.i;
+		if (scaloc != 1.) {
+		    i__1 = *n;
+		    for (j = 1; j <= i__1; ++j) {
+			zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L100: */
+		    }
+		    *scale *= scaloc;
+		}
+		i__1 = k + l * c_dim1;
+		c__[i__1].r = x11.r, c__[i__1].i = x11.i;
+/* L110: */
+	    }
+/* L120: */
+	}
+    }
+    return;
+}
diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c
new file mode 100644
index 000000000..ac9fe7bd4
--- /dev/null
+++ b/relapack/src/ztrtri.c
@@ -0,0 +1,107 @@
+#include "relapack.h"
+
+static void RELAPACK_ztrtri_rec(const char *, const char *, const int *,
+    double *, const int *, int *);
+
+
+/** ZTRTRI computes the inverse of a complex upper or lower triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's ztrtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html
+ * */
+void RELAPACK_ztrtri(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZTRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ?
'N' : 'U'; + + // check for singularity + if (nounit) { + int i; + for (i = 0; i < *n; i++) + if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { + *info = i; + return; + } + } + + // Recursive kernel + RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); +} + + +/** ztrtri's recursive compute kernel */ +static void RELAPACK_ztrtri_rec( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) { + // Unblocked + LAPACK(ztrti2)(uplo, diag, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = - A_BL * A_TL + BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); + // A_BL = A_BR \ A_BL + BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TR = - A_TL * A_TR + BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); + // A_TR = A_TR / A_BR + BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} From b122413fb01aacff3112fdb387e74126539a8e5f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Jun 2017 18:13:14 +0200 Subject: [PATCH 12/42] Restore ReLAPACK test folder --- relapack/test/README.md | 48 ++++++++++++++ relapack/test/config.h | 13 ++++ relapack/test/lapack.h | 64 +++++++++++++++++++ relapack/test/test.h | 136 ++++++++++++++++++++++++++++++++++++++++ relapack/test/util.c | 116 ++++++++++++++++++++++++++++++++++ relapack/test/util.h | 15 +++++ relapack/test/xgbtrf.c | 43 +++++++++++++ relapack/test/xgemmt.c | 65 +++++++++++++++++++ relapack/test/xgetrf.c | 32 ++++++++++ relapack/test/xhegst.c | 32 ++++++++++ relapack/test/xhetrf.c | 40 ++++++++++++ relapack/test/xlauum.c | 25 ++++++++ relapack/test/xpbtrf.c | 40 ++++++++++++ relapack/test/xpotrf.c | 25 ++++++++ relapack/test/xsygst.c | 32 ++++++++++ relapack/test/xsytrf.c | 40 ++++++++++++ relapack/test/xtgsyl.c | 94 +++++++++++++++++++++++++++ relapack/test/xtrsyl.c | 65 +++++++++++++++++++ relapack/test/xtrtri.c | 25 ++++++++ 19 files changed, 950 insertions(+) create mode 100644 relapack/test/README.md create mode 100644 relapack/test/config.h create mode 100644 relapack/test/lapack.h create mode 100644 relapack/test/test.h create mode 100644 relapack/test/util.c create mode 100644 relapack/test/util.h create mode 100644 relapack/test/xgbtrf.c create mode 100644 relapack/test/xgemmt.c create mode 100644 relapack/test/xgetrf.c create mode 100644 relapack/test/xhegst.c create mode 100644 relapack/test/xhetrf.c create mode 100644 relapack/test/xlauum.c create mode 100644 relapack/test/xpbtrf.c create mode 100644 relapack/test/xpotrf.c create mode 100644 relapack/test/xsygst.c create mode 100644 relapack/test/xsytrf.c create mode 100644 relapack/test/xtgsyl.c create mode 100644 relapack/test/xtrsyl.c create mode 100644 relapack/test/xtrtri.c diff --git a/relapack/test/README.md b/relapack/test/README.md new file mode 100644 index 000000000..48434b3cf --- /dev/null +++ 
b/relapack/test/README.md
@@ -0,0 +1,48 @@
+ReLAPACK Test Suite
+===================
+This test suite compares ReLAPACK's recursive routines with LAPACK's compute
+routines in terms of accuracy: For each test-case, we execute both ReLAPACK's
+and LAPACK's routine on the same data and consider the numerical difference
+between the two solutions.
+
+This difference is computed as the maximum error across all elements of the
+routine's outputs, where the error for each element is the minimum of the
+absolute error and the relative error (with LAPACK as the reference). If the
+error is below the error bound configured in `config.h` (default: 1e-4 for
+single precision and 1e-13 for double precision), the test-case is considered
+passed.
+
+For each routine the test-cases cover a variety of input argument combinations
+to ensure that ReLAPACK's routines match the functionality of LAPACK for all
+use cases.
+
+The matrix size for all experiments (default: 100) can also be specified in
+`config.h`.
+
+
+Implementation
+--------------
+`test.h` provides the framework for our tests: it provides macros that make it
+possible to generalize the tests for each operation into one file covering all
+data-types. Such a file is structured as follows:
+
+ * All matrices required by the test-cases are declared globally. For each
+   matrix, an array of two pointers is declared; one for the matrix copy passed
+   to ReLAPACK and one for the copy passed to LAPACK.
+
+ * `tests()` contains the main control flow: it allocates (and later frees) the
+   copies of the globally declared matrices. It then defines the macro
+   `ROUTINE` to contain the name of the currently tested routine and uses the
+   macro `TEST` to perform the test-cases. `TEST` receives the arguments of the
+   routine; matrices of which ReLAPACK and LAPACK each receive a copy are
+   indexed with `i`. (Example: `TEST("L", &n, A[i], &n, info);`)
+
+ * The macro `TEST` first calls `pre()`, which initializes all relevant
+   matrices, then executes the ReLAPACK algorithm on the matrices with `i` =
+   `0` and then the LAPACK counterpart with `i` = `1`. It then calls `post()`,
+   which computes the difference between the results, storing it in `error`.
+   Finally, the error is printed out and compared to the error bound.
+
+If all test-cases pass the error bound test, the program will have a `0` return
+value; otherwise it is `1`, indicating an error.
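+
+As an illustrative sketch (not one of the shipped test files -- the `n`,
+`info`, and allocation details shown here are assumptions; the real macro
+definitions live in `test.h`), a test for a Cholesky-style routine following
+the conventions above would look roughly like this:
+
+    #include "test.h"
+
+    // A[0]: copy passed to ReLAPACK, A[1]: copy passed to LAPACK
+    double *A[2];
+
+    void tests() {
+        A[0] = malloc(n * n * sizeof(double));
+        A[1] = malloc(n * n * sizeof(double));
+
+        #define ROUTINE dpotrf
+
+        TEST("L", &n, A[i], &n, info);
+        TEST("U", &n, A[i], &n, info);
+
+        free(A[0]);
+        free(A[1]);
+    }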
diff --git a/relapack/test/config.h b/relapack/test/config.h new file mode 100644 index 000000000..ab06a2fff --- /dev/null +++ b/relapack/test/config.h @@ -0,0 +1,13 @@ +#ifndef TEST_CONFIG_H +#define TEST_CONFIG_H + +// error bound for single and single complex routines +#define SINGLE_ERR_BOUND 1e-4 + +// error bound for double and double complex routines +#define DOUBLE_ERR_BOUND 1e-13 + +// size of test matrices +#define TEST_SIZE 100 + +#endif /* TEST_CONFIG_H */ diff --git a/relapack/test/lapack.h b/relapack/test/lapack.h new file mode 100644 index 000000000..80f5c419e --- /dev/null +++ b/relapack/test/lapack.h @@ -0,0 +1,64 @@ +#ifndef LAPACK_H2 +#define LAPACK_H2 + +#include "../config.h" + +void LAPACK(slauum)(const char *, const int *, float *, const int *, int *); +void LAPACK(dlauum)(const char *, const int *, double *, const int *, int *); +void LAPACK(clauum)(const char *, const int *, float *, const int *, int *); +void LAPACK(zlauum)(const char *, const int *, double *, const int *, int *); + +void LAPACK(strtri)(const char *, const char *, const int *, float *, const int *, int *); +void LAPACK(dtrtri)(const char *, const char *, const int *, double *, const int *, int *); +void LAPACK(ctrtri)(const char *, const char *, const int *, float *, const int *, int *); +void LAPACK(ztrtri)(const char *, const char *, const int *, double *, const int *, int *); + +void LAPACK(spotrf)(const char *, const int *, float *, const int *, int *); +void LAPACK(dpotrf)(const char *, const int *, double *, const int *, int *); +void LAPACK(cpotrf)(const char *, const int *, float *, const int *, int *); +void LAPACK(zpotrf)(const char *, const int *, double *, const int *, int *); + +void LAPACK(spbtrf)(const char *, const int *, const int *, float *, const int *, int *); +void LAPACK(dpbtrf)(const char *, const int *, const int *, double *, const int *, int *); +void LAPACK(cpbtrf)(const char *, const int *, const int *, float *, const int *, int *); +void LAPACK(zpbtrf)(const char *, const int *, const int *, double *, const int *, int *); + +void LAPACK(ssytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(dsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(csytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(chetrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(zsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(zhetrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(ssytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(dsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(csytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(chetrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void LAPACK(zsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void LAPACK(zhetrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); + +void LAPACK(sgetrf)(const int *, const int *, float *, const int *, int *, int *); +void LAPACK(dgetrf)(const int *, const int *, double *, const
int *, int *, int *); +void LAPACK(cgetrf)(const int *, const int *, float *, const int *, int *, int *); +void LAPACK(zgetrf)(const int *, const int *, double *, const int *, int *, int *); + +void LAPACK(sgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void LAPACK(dgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void LAPACK(cgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void LAPACK(zgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); + +void LAPACK(ssygst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void LAPACK(dsygst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void LAPACK(chegst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void LAPACK(zhegst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); + +void LAPACK(strsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void LAPACK(dtrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void LAPACK(ctrsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void LAPACK(ztrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); + +void LAPACK(stgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void LAPACK(dtgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void LAPACK(ctgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void LAPACK(ztgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); + +#endif /* LAPACK_H2 */ diff --git a/relapack/test/test.h b/relapack/test/test.h new file mode 100644 index 000000000..24089f3a8 --- /dev/null +++ b/relapack/test/test.h @@ -0,0 +1,136 @@ +#ifndef TEST_H +#define TEST_H + +#include "../config.h" +#include "config.h" + +#if BLAS_UNDERSCORE +#define BLAS(routine) routine ## _ +#else +#define BLAS(routine) routine +#endif + +#if LAPACK_UNDERSCORE +#define LAPACK(routine) routine ## _ +#else +#define LAPACK(routine) routine +#endif + +#include "../inc/relapack.h" +#include 
"lapack.h" +#include "util.h" +#include +#include +#include + +// some name mangling macros +#define CAT(A, B) A ## B +#define XCAT(A, B) CAT(A, B) +#define XLAPACK(X) LAPACK(X) +#define XRELAPACK(X) XCAT(RELAPACK_, X) +#define STR(X) #X +#define XSTR(X) STR(X) + +// default setup and error computation names: pre() and post() +#define PRE pre +#define POST post + +// TEST macro: +// run setup (pre()), ReLAPACK routine (i = 0), LAPACK routine (i = 1), compute +// error (post()), check error bound, and print setup and error +#define TEST(...) \ + PRE(); \ + i = 0; \ + XRELAPACK(ROUTINE)(__VA_ARGS__); \ + i = 1; \ + XLAPACK(ROUTINE)(__VA_ARGS__); \ + POST(); \ + fail |= error > ERR_BOUND; \ + printf("%s(%s)\t%g\n", XSTR(ROUTINE), #__VA_ARGS__, error); + +// generalized datatype treatment: DT_PREFIX determines the type s, d, c, or z +#define XPREF(A) XCAT(DT_PREFIX, A) + +// matrix generation and error computation routines +#define x2matgen XPREF(2matgen) +#define x2vecerr XPREF(2vecerr) + +// error bounds +#define ERR_BOUND XPREF(ERR_BOUND_) +#define sERR_BOUND_ SINGLE_ERR_BOUND +#define dERR_BOUND_ DOUBLE_ERR_BOUND +#define cERR_BOUND_ SINGLE_ERR_BOUND +#define zERR_BOUND_ DOUBLE_ERR_BOUND + +// C datatypes +#define datatype XPREF(datatype_) +#define sdatatype_ float +#define ddatatype_ double +#define cdatatype_ float +#define zdatatype_ double + +// number of C datatype elements per element +#define x1 XPREF(DT_MULT) +#define sDT_MULT 1 +#define dDT_MULT 1 +#define cDT_MULT 2 +#define zDT_MULT 2 + +// typed allocations +#define xmalloc XPREF(malloc) +#define imalloc(S) malloc((S) * sizeof(int)) +#define smalloc(S) malloc((S) * sizeof(float)) +#define dmalloc(S) malloc((S) * sizeof(double)) +#define cmalloc(S) malloc((S) * 2 * sizeof(float)) +#define zmalloc(S) malloc((S) * 2 * sizeof(double)) + +// transpositions +#define xCTRANS XPREF(CTRANS) +#define sCTRANS "T" +#define dCTRANS "T" +#define cCTRANS "C" +#define zCTRANS "C" + +// some constants +#define MONE XPREF(MONE) +const float sMONE[] = { -1. }; +const double dMONE[] = { -1. }; +const float cMONE[] = { -1., 0. }; +const double zMONE[] = { -1., 0. }; + +#define ZERO XPREF(ZERO) +const float sZERO[] = { 0. }; +const double dZERO[] = { 0. }; +const float cZERO[] = { 0., 0. }; +const double zZERO[] = { 0., 0. }; + +#define ONE XPREF(ONE) +const float sONE[] = { 1. }; +const double dONE[] = { 1. }; +const float cONE[] = { 1., 0. }; +const double zONE[] = { 1., 0. }; + +const int iMONE[] = { -1 }; +const int iZERO[] = { 0 }; +const int iONE[] = { 1 }; +const int iTWO[] = { 2 }; +const int iTHREE[] = { 3 }; +const int iFOUR[] = { 4 }; + +void tests(); + +// global variables (used in tests(), pre(), and post()) +int i, n, n2, fail; +double error; + +int main(int argc, char* argv[]) { + n = TEST_SIZE; + n2 = (3 * n) / 4; + fail = 0; + + tests(); + + return fail; +} + +#endif /* TEST_H */ diff --git a/relapack/test/util.c b/relapack/test/util.c new file mode 100644 index 000000000..e0fca3eec --- /dev/null +++ b/relapack/test/util.c @@ -0,0 +1,116 @@ +#include "util.h" +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +/////////////////////// +// matrix generation // +/////////////////////// +// Each routine x2matgen is passed the size (m, n) of the desired matrix and +// geneartes two copies of such a matrix in in its output arguments A and B. +// The generated matrices is filled with random entries in [0, 1[ (+i*[0, 1[ in +// the complex case). 
Then m is added to the diagonal; this is numerically +// favorable for routines working with triangular and symmetric matrices. For +// the same reason the imaginary part of the diagonal is set to 0. + +void s2matgen(const int m, const int n, float *A, float *B) { + srand(time(NULL) + (size_t) A); + int i, j; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) + A[i + m * j] = B[i + m * j] = (float) rand() / RAND_MAX + m * (i == j); +} + +void d2matgen(const int m, const int n, double *A, double *B) { + srand(time(NULL) + (size_t) A); + int i, j; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) + A[i + m * j] = B[i + m * j] = (double) rand() / RAND_MAX + m * (i == j); +} + +void c2matgen(const int m, const int n, float *A, float *B) { + srand(time(NULL) + (size_t) A); + int i, j; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) { + A[2 * (i + m * j)] = B[2 * (i + m * j)] = (float) rand() / RAND_MAX + m * (i == j); + A[2 * (i + m * j) + 1] = B[2 * (i + m * j) + 1] = ((float) rand() / RAND_MAX) * (i != j); + } +} + +void z2matgen(const int m, const int n, double *A, double *B) { + srand(time(NULL) + (size_t) A); + int i, j; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) { + A[2 * (i + m * j)] = B[2 * (i + m * j)] = (double) rand() / RAND_MAX + m * (i == j); + A[2 * (i + m * j) + 1] = B[2 * (i + m * j) + 1] = ((double) rand() / RAND_MAX) * (i != j); + } +} + +//////////////////////// +// error computations // +//////////////////////// +// Each routine x2vecerr is passed a vector length n and two vectors x and y. +// It returns the maximum of the element-wise error between these two vectors. +// This error is the minimum of the absolute difference and the relative +// difference with respect to y. + +double i2vecerr(const int n, const int *x, const int *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = abs(x[i] - y[i]); + double den = abs(y[i]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double s2vecerr(const int n, const float *x, const float *y) { + float error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = fabs((double) x[i] - y[i]); + double den = fabs(y[i]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double d2vecerr(const int n, const double *x, const double *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = fabs(x[i] - y[i]); + double den = fabs(y[i]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double c2vecerr(const int n, const float *x, const float *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = sqrt(((double) x[2 * i] - y[2 * i]) * ((double) x[2 * i] - y[2 * i]) + ((double) x[2 * i + 1] - y[2 * i + 1]) * ((double) x[2 * i + 1] - y[2 * i + 1])); + double den = sqrt((double) y[2 * i] * y[2 * i] + (double) y[2 * i + 1] * y[2 * i + 1]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double z2vecerr(const int n, const double *x, const double *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = sqrt((x[2 * i] - y[2 * i]) * (x[2 * i] - y[2 * i]) + (x[2 * i + 1] - y[2 * i + 1]) * (x[2 * i + 1] - y[2 * i + 1])); + double den = sqrt(y[2 * i] * y[2 * i] + y[2 * i + 1] * y[2 * i + 1]); + error = MAX(error, (den > 0) ?
MIN(nom, nom / den) : nom); + } + return error; +} diff --git a/relapack/test/util.h b/relapack/test/util.h new file mode 100644 index 000000000..11d2999e0 --- /dev/null +++ b/relapack/test/util.h @@ -0,0 +1,15 @@ +#ifndef TEST_UTIL_H +#define TEST_UTIL_H + +void s2matgen(int, int, float *, float *); +void d2matgen(int, int, double *, double *); +void c2matgen(int, int, float *, float *); +void z2matgen(int, int, double *, double *); + +double i2vecerr(int, const int *, const int *); +double s2vecerr(int, const float *, const float *); +double d2vecerr(int, const double *, const double *); +double c2vecerr(int, const float *, const float *); +double z2vecerr(int, const double *, const double *); + +#endif /* TEST_UTIL_H */ diff --git a/relapack/test/xgbtrf.c b/relapack/test/xgbtrf.c new file mode 100644 index 000000000..f255006a5 --- /dev/null +++ b/relapack/test/xgbtrf.c @@ -0,0 +1,43 @@ +#include "test.h" + +datatype *A[2]; +int *ipiv[2], info; +int kl, ku, ld; + +void pre() { + int i; + x2matgen(ld, n, A[0], A[1]); + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + ld * i)] = + A[1][x1 * (i + ld * i)] = (datatype) rand() / RAND_MAX; + } + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(ld * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + kl = n - 10; + ku = n; + ld = 2 * kl + ku + 1; + + A[0] = xmalloc(ld * n); + A[1] = xmalloc(ld * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + + #define ROUTINE XPREF(gbtrf) + + TEST(&n, &n, &kl, &ku, A[i], &ld, ipiv[i], &info); + TEST(&n, &n2, &kl, &ku, A[i], &ld, ipiv[i], &info); + TEST(&n2, &n, &kl, &ku, A[i], &ld, ipiv[i], &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); +} diff --git a/relapack/test/xgemmt.c b/relapack/test/xgemmt.c new file mode 100644 index 000000000..ffc37049d --- /dev/null +++ b/relapack/test/xgemmt.c @@ -0,0 +1,65 @@ +#include "test.h" + +datatype *A[2], *B[2], *C[2], *Ctmp; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); + x2matgen(n, n, C[0], C[1]); +} + +void post() { + error = x2vecerr(n * n, C[0], C[1]); +} + +#define ROUTINE XPREF(gemmt) + +#define xlacpy XPREF(LAPACK(lacpy)) +#define xgemm XPREF(BLAS(gemm)) + +extern void xlacpy(const char *, const int *, const int *, const datatype *, const int *, datatype *, const int *); +extern void xgemm(const char *, const char *, const int *, const int *, const int *, const datatype *, const datatype *, const int *, const datatype *, const int *, const datatype *, datatype *, const int*); + +void XLAPACK(ROUTINE)( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const datatype *alpha, const datatype *A, const int *ldA, + const datatype *B, const int *ldB, + const datatype *beta, datatype *C, const int *ldC +) { + xlacpy(uplo, n, n, C, ldC, Ctmp, n); + xgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, Ctmp, n); + // copy back: source Ctmp has leading dimension n, destination C has ldC + xlacpy(uplo, n, n, Ctmp, n, C, ldC); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + C[0] = xmalloc(n * n); + C[1] = xmalloc(n * n); + Ctmp = xmalloc(n * n); + + TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n); + TEST("L", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "T", "N", &n, &n,
ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n); + TEST("U", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "T", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); + free(C[0]); + free(C[1]); + free(Ctmp); +} diff --git a/relapack/test/xgetrf.c b/relapack/test/xgetrf.c new file mode 100644 index 000000000..4484a24af --- /dev/null +++ b/relapack/test/xgetrf.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2]; +int *ipiv[2], info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + + #define ROUTINE XPREF(getrf) + + TEST(&n, &n, A[i], &n, ipiv[i], &info); + TEST(&n, &n2, A[i], &n, ipiv[i], &info); + TEST(&n2, &n, A[i], &n, ipiv[i], &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); +} diff --git a/relapack/test/xhegst.c b/relapack/test/xhegst.c new file mode 100644 index 000000000..c318ef546 --- /dev/null +++ b/relapack/test/xhegst.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2], *B[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + + #define ROUTINE XPREF(hegst) + + TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); +} diff --git a/relapack/test/xhetrf.c b/relapack/test/xhetrf.c new file mode 100644 index 000000000..b5d54bdff --- /dev/null +++ b/relapack/test/xhetrf.c @@ -0,0 +1,40 @@ +#include "test.h" + +datatype *A[2], *Work; +int *ipiv[2], info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + const int lWork = n * n; + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + Work = xmalloc(lWork); + + #define ROUTINE XPREF(hetrf) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + #undef ROUTINE + #define ROUTINE XPREF(hetrf_rook) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); + free(Work); +} diff --git a/relapack/test/xlauum.c b/relapack/test/xlauum.c new file mode 100644 index 000000000..d2c42fa01 --- /dev/null +++ b/relapack/test/xlauum.c @@ -0,0 +1,25 @@ +#include "test.h" + +datatype *A[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], 
A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(lauum) + + TEST("L", &n, A[i], &n, &info); + TEST("U", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xpbtrf.c b/relapack/test/xpbtrf.c new file mode 100644 index 000000000..9a9babb6b --- /dev/null +++ b/relapack/test/xpbtrf.c @@ -0,0 +1,40 @@ +#include "test.h" + +datatype *A[2]; +int info[2]; +int n; + +void pre() { + int i; + x2matgen(n, n, A[0], A[1]); + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + n * i)] = + A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + // set first row + A[0][x1 * (n * i)] = + A[1][x1 * (n * i)] = (datatype) rand() / RAND_MAX + n; + } +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(pbtrf) + + const int + kd1 = n / 4, + kd2 = n * 3 / 4; + TEST("L", &n, &kd1, A[i], &n, &info[i]); + TEST("L", &n, &kd2, A[i], &n, &info[i]); + TEST("U", &n, &kd1, A[i] - x1 * kd1, &n, &info[i]); + TEST("U", &n, &kd2, A[i] - x1 * kd2, &n, &info[i]); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xpotrf.c b/relapack/test/xpotrf.c new file mode 100644 index 000000000..5e04d426f --- /dev/null +++ b/relapack/test/xpotrf.c @@ -0,0 +1,25 @@ +#include "test.h" + +datatype *A[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(potrf) + + TEST("L", &n, A[i], &n, &info); + TEST("U", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xsygst.c b/relapack/test/xsygst.c new file mode 100644 index 000000000..b473a5919 --- /dev/null +++ b/relapack/test/xsygst.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2], *B[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + + #define ROUTINE XPREF(sygst) + + TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); +} diff --git a/relapack/test/xsytrf.c b/relapack/test/xsytrf.c new file mode 100644 index 000000000..82d626f6f --- /dev/null +++ b/relapack/test/xsytrf.c @@ -0,0 +1,40 @@ +#include "test.h" + +datatype *A[2], *Work; +int *ipiv[2], info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + const int lWork = n * n; + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + Work = xmalloc(lWork); + + #define ROUTINE XPREF(sytrf) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + #undef ROUTINE + #define ROUTINE XPREF(sytrf_rook) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); + free(Work); +} diff --git a/relapack/test/xtgsyl.c 
b/relapack/test/xtgsyl.c new file mode 100644 index 000000000..74db5005e --- /dev/null +++ b/relapack/test/xtgsyl.c @@ -0,0 +1,94 @@ +#include "test.h" + +datatype *A[2], *B[2], *C[2], *D[2], *E[2], *F[2], *Work, scale[2], dif[2]; +int *iWork, lWork, info; + +#define xlascl XPREF(LAPACK(lascl)) +void xlascl(const char *, const int *, const int *, const datatype *, const + datatype *, const int *, const int *, datatype *, const int *, int *); + +#define xscal XPREF(LAPACK(scal)) +void xscal(const int *, const datatype *, datatype *, const int *); + +void pre() { + int i; + + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); + x2matgen(n, n, C[0], C[1]); + x2matgen(n, n, D[0], D[1]); + x2matgen(n, n, E[0], E[1]); + x2matgen(n, n, F[0], F[1]); + + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + n * i)] = + A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + E[0][x1 * (i + n * i)] = + E[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + // clear first subdiagonal + A[0][x1 * (i + 1 + n * i)] = + A[1][x1 * (i + 1 + n * i)] = + B[0][x1 * (i + 1 + n * i)] = + B[1][x1 * (i + 1 + n * i)] = + A[0][x1 * (i + 1 + n * i) + x1 - 1] = + A[1][x1 * (i + 1 + n * i) + x1 - 1] = + B[0][x1 * (i + 1 + n * i) + x1 - 1] = + B[1][x1 * (i + 1 + n * i) + x1 - 1] = 0; + } +} + + +void post() { + if (scale[0] != 1 || scale[0] != 1) + printf("scale[RELAPACK] = %12g\tscale[LAPACK] = %12g\n", scale[0], scale[1]); + if (scale[0]) { + xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, C[0], &n, &info); + xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, F[0], &n, &info); + } + error = x2vecerr(n * n, C[0], C[1]) + x2vecerr(n * n, F[0], F[1]); +} + +void tests() { + lWork = 2 * n * n; + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + C[0] = xmalloc(n * n); + C[1] = xmalloc(n * n); + D[0] = xmalloc(n * n); + D[1] = xmalloc(n * n); + E[0] = xmalloc(n * n); + E[1] = xmalloc(n * n); + F[0] = xmalloc(n * n); + F[1] = xmalloc(n * n); + Work = xmalloc(lWork); + iWork = imalloc(n + n + 2); + + #define ROUTINE XPREF(tgsyl) + + TEST("N", iZERO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iZERO, &n2, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iZERO, &n, &n2, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iTWO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iTHREE, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST("N", iFOUR, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + TEST(xCTRANS, iZERO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); + free(C[0]); + free(C[1]); + free(D[0]); + free(D[1]); + free(E[0]); + free(E[1]); + free(F[0]); + free(F[1]); + free(Work); + free(iWork); +} diff --git a/relapack/test/xtrsyl.c b/relapack/test/xtrsyl.c new file mode 100644 index 000000000..358a89242 --- /dev/null +++ 
b/relapack/test/xtrsyl.c @@ -0,0 +1,65 @@ +#include "test.h" + +datatype *A[2], *B[2], *C[2], *Work, scale[2]; +int info; + +#define xlascl XPREF(LAPACK(lascl)) +void xlascl(const char *, const int *, const int *, const datatype *, const + datatype *, const int *, const int *, datatype *, const int *, int *); + +void pre() { + int i; + + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); + x2matgen(n, n, C[0], C[1]); + + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + n * i)] = + A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + // clear first subdiagonal + A[0][x1 * (i + 1 + n * i)] = + A[1][x1 * (i + 1 + n * i)] = + B[0][x1 * (i + 1 + n * i)] = + B[1][x1 * (i + 1 + n * i)] = + A[0][x1 * (i + 1 + n * i) + x1 - 1] = + A[1][x1 * (i + 1 + n * i) + x1 - 1] = + B[0][x1 * (i + 1 + n * i) + x1 - 1] = + B[1][x1 * (i + 1 + n * i) + x1 - 1] = 0; + } +} + +void post() { + if (scale[0] != 1 || scale[1] != 1) + printf("scale[RELAPACK] = %12g\tscale[LAPACK] = %12g\n", scale[0], scale[1]); + if (scale[0]) + xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, C[0], &n, &info); + error = x2vecerr(n * n, C[0], C[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + C[0] = xmalloc(n * n); + C[1] = xmalloc(n * n); + + #define ROUTINE XPREF(trsyl) + + TEST("N", "N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("N", "N", iONE, &n2, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("N", "N", iONE, &n, &n2, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("C", "N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("N", "C", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("C", "C", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + TEST("N", "N", iMONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); + free(C[0]); + free(C[1]); +} diff --git a/relapack/test/xtrtri.c b/relapack/test/xtrtri.c new file mode 100644 index 000000000..106391bc8 --- /dev/null +++ b/relapack/test/xtrtri.c @@ -0,0 +1,25 @@ +#include "test.h" + +datatype *A[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(trtri) + + TEST("L", "N", &n, A[i], &n, &info); + TEST("U", "N", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} From 912410f2140da59c0b639110fd6ac6ed495e8bec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Jun 2017 18:15:21 +0200 Subject: [PATCH 13/42] Add ReLAPACK to Makefiles --- Makefile | 20 ++++++++++++++------ Makefile.rule | 5 ++++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 27923aa9b..499732d16 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,11 @@ ifneq ($(NO_LAPACK), 1) SUBDIRS += lapack endif +RELA = +ifneq ($(BUILD_RELAPACK), 0) +RELA = re_lapack +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench .PHONY : all libs netlib test ctest shared install .NOTPARALLEL : all libs prof lapack-test install blas-test -all :: libs netlib tests shared +all :: libs netlib $(RELA) tests shared @echo @echo " OpenBLAS build complete.
($(LIB_COMPONENTS))" @echo @@ -215,6 +220,14 @@ ifndef NO_LAPACKE endif endif +ifeq ($(NO_LAPACK), 1) +re_lapack : + +else +re_lapack : + @$(MAKE) -C relapack +endif + prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof @@ -329,8 +342,3 @@ endif @rm -f *.grd Makefile.conf_last config_last.h @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) @echo Done. - -# Makefile debugging trick: -# call print-VARIABLE to see the runtime value of any variable -print-%: - @echo '$*=$($*)' diff --git a/Makefile.rule b/Makefile.rule index b6c22f798..2866699be 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -83,6 +83,9 @@ VERSION = 0.2.20.dev # Build LAPACK Deprecated functions since LAPACK 3.6.0 BUILD_LAPACK_DEPRECATED = 1 +# Build RecursiveLAPACK on top of LAPACK +BUILD_RELAPACK = 1 + # If you want to use legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1 NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. -NO_AFFINITY = 1 +#NO_AFFINITY = 1 # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 From 0d5c8e53861adbb54ee9952ed76c2e8dd029ca96 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 12:43:13 +0530 Subject: [PATCH 14/42] arm: Determine the abi from compiler if not specified on command line If ARM abi is not explicitly mentioned on the command line, then set the arm abi to softfp or hard according to the compiler environment. This assumes that compiler sets the defines __ARM_PCS and __ARM_PCS_VFP accordingly. --- Makefile.arm | 23 +++-------------------- Makefile.system | 24 ++++++++++++++++-------- c_check | 16 +++++++++++++++- common_arm.h | 5 ----- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index c189b0c47..eedd39b73 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,5 +1,4 @@ -#ifeq logical or -ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) CCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon -march=armv7-a @@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a endif endif -ifeq ($(CORE), ARMV7) -ifeq ($(OSNAME), Android) -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a -else -CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -endif -else -CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -endif -endif - ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp -march=armv6 endif - ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -march=armv5 -FCOMMON_OPT += -marm -march=armv5 +CCOMMON_OPT += -march=armv5 +FCOMMON_OPT += -march=armv5 endif diff --git a/Makefile.system b/Makefile.system index 29d3efd53..2cae5f1c9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -242,6 +242,10 @@ EXTRALIB += -lm NO_EXPRECISION = 1 endif +ifeq ($(OSNAME), Android) +EXTRALIB += -lm +endif + ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif @@ -483,16 +487,20 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 -CCOMMON_OPT += -marm -FCOMMON_OPT += -marm - -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI -FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI +# If ABI is specified on command line use it. 
Else use the automatically detected ABI. +ifeq ($(ARM_SOFTFP_ABI),1) +ARM_ABI = softfp else -CCOMMON_OPT += -mfloat-abi=hard -FCOMMON_OPT += -mfloat-abi=hard +ifeq ($(ARM_HARD_ABI),1) +ARM_ABI = hard +else +ARM_ABI=$(ARM_ABI_AUTO) endif +endif +export ARM_ABI_AUTO +CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) +FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) + endif ifeq ($(ARCH), arm64) diff --git a/c_check b/c_check index 20da288be..2e7e08cfb 100644 --- a/c_check +++ b/c_check @@ -94,7 +94,17 @@ if ($architecture eq "mips64") { $defined = 1; } -if (($architecture eq "arm") || ($architecture eq "arm64")) { +if ($architecture eq "arm") { + $defined = 1; + $data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`; + if ($data ne "") { + $abi = "hard"; + } else { + $abi = "softfp"; + } +} + +if ($architecture eq "arm64") { $defined = 1; } @@ -287,6 +297,10 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; +if ($architecture eq "arm") { + print MAKEFILE "ARM_ABI_AUTO=$abi\n"; +} + $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; $compiler =~ tr/[a-z]/[A-Z]/; diff --git a/common_arm.h b/common_arm.h index a17acb448..27fa76b76 100644 --- a/common_arm.h +++ b/common_arm.h @@ -111,11 +111,6 @@ REALNAME: #define PROFCODE -#ifdef __ARM_PCS -//-mfloat-abi=softfp -#define SOFT_FLOAT_ABI -#endif - #endif From da7f0ff425a4ccd9a4f32b7fff33b9ef807ad0f4 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 12:46:18 +0530 Subject: [PATCH 15/42] generic: add some generic gemm and trmm kernels Added generic 4x4 and 4x2 gemm kernels Added generic 4x2 trmm kernel --- kernel/generic/gemmkernel_4x2.c | 317 ++++++++++++++++++ kernel/generic/gemmkernel_4x4.c | 571 ++++++++++++++++++++++++++++++++ kernel/generic/trmmkernel_4x2.c | 528 +++++++++++++++++++++++++++++ 3 files changed, 1416 insertions(+) create mode 100644 kernel/generic/gemmkernel_4x2.c create mode 100644 kernel/generic/gemmkernel_4x4.c create mode 100644 kernel/generic/trmmkernel_4x2.c diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c new file mode 100644 index 000000000..1d15de1d7 --- /dev/null +++ b/kernel/generic/gemmkernel_4x2.c @@ -0,0 +1,317 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + + for (j=0; j<(bn/2); j+=2) + { + C0 = C; + C1 = C0+ldc; + + ptrba = ba; + + for (i=0; i + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + + for (j=0; j + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + for (j=0; j<(bn/2); j+=2) // do the Mx2 loops + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + + ptrba = ba; + + for (i=0; i Date: Fri, 30 Jun 2017 13:06:38 +0530 Subject: [PATCH 16/42] arm: Use assembly implementations based on the ARM abi In case of the softfp ABI, assembly implementations are used only for those APIs that do not have a floating point argument or return value. In case of the hard ABI, all assembly implementations are used.
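For context on why the ABI matters here: under the hard-float ABI (for which the compiler defines `__ARM_PCS_VFP`), floating-point results are returned in VFP registers (s0/d0), while under softfp the caller expects them in core registers (r0, or the r0:r1 pair for doubles). The epilogue pattern that the following patches add to the affected kernels is roughly this sketch (mirroring the asum/nrm2/dot hunks below; `DOUBLE` is OpenBLAS's usual precision define):

    /* end of a kernel that returns a floating-point value */
    #if !defined(__ARM_PCS_VFP)   /* softfp: move the result into core registers */
    #if !defined(DOUBLE)
        vmov r0, s0               /* single-precision result -> r0 */
    #else
        vmov r0, r1, d0           /* double-precision result -> r0:r1 */
    #endif
    #endif                        /* hard abi: result is already in s0/d0 */
        bx lr

APIs without floating-point arguments or return values are unaffected by this distinction, which is why their kernels can remain outside the `ifeq ($(ARM_ABI),hard)` guard in the KERNEL files changed below.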
--- kernel/arm/KERNEL.ARMV6 | 112 +++++++++++++++-------------------- kernel/arm/KERNEL.ARMV7 | 125 +++++----------------------------------- 2 files changed, 60 insertions(+), 177 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 16bde105b..a2dd4806d 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,7 +1,5 @@ +include $(KERNELDIR)/KERNEL.ARMV5 - - -############################################################################### SAMAXKERNEL = iamax_vfp.S DAMAXKERNEL = iamax_vfp.S CAMAXKERNEL = iamax_vfp.S @@ -34,6 +32,45 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S +SGEMMKERNEL = ../generic/gemmkernel_4x2.c +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = sgemm_ncopy_2_vfp.S +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_4x2.c +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif +DGEMMONCOPY = dgemm_ncopy_2_vfp.S +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +STRMMKERNEL = ../generic/trmmkernel_4x2.c +DTRMMKERNEL = ../generic/trmmkernel_4x2.c + +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +ifeq ($(ARM_ABI),hard) + SASUMKERNEL = asum_vfp.S DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S @@ -44,11 +81,6 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c - SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S @@ -64,11 +96,6 @@ DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S @@ -84,63 +111,14 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S -ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S -SGEMMINCOPY = sgemm_ncopy_4_vfp.S -SGEMMITCOPY = sgemm_tcopy_4_vfp.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPY = sgemm_ncopy_2_vfp.S -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - DGEMMKERNEL = dgemm_kernel_4x2_vfp.S -DGEMMINCOPY = dgemm_ncopy_4_vfp.S -DGEMMITCOPY = dgemm_tcopy_4_vfp.S -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPY = dgemm_ncopy_2_vfp.S -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - CGEMMKERNEL = cgemm_kernel_2x2_vfp.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S 
-CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - +endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d5cd94fbd..d4829faa3 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,86 +1,29 @@ +include $(KERNELDIR)/KERNEL.ARMV6 -################################################################################# -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_4x4.c -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S +SGEMMKERNEL = ../generic/gemmkernel_4x4.c +SGEMMONCOPY = sgemm_ncopy_4_vfp.S +SGEMMOTCOPY = sgemm_tcopy_4_vfp.S +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S +DGEMMKERNEL = ../generic/gemmkernel_4x4.c +DGEMMONCOPY = dgemm_ncopy_4_vfp.S +DGEMMOTCOPY = dgemm_tcopy_4_vfp.S +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S - -ISAMAXKERNEL = iamax_vfp.S -IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S - -ISAMINKERNEL = iamax_vfp.S -IDAMINKERNEL = iamax_vfp.S -ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S - -ISMAXKERNEL = iamax_vfp.S -IDMAXKERNEL = iamax_vfp.S - -ISMINKERNEL = iamax_vfp.S -IDMINKERNEL = iamax_vfp.S - -SSWAPKERNEL = swap_vfp.S -DSWAPKERNEL = swap_vfp.S -CSWAPKERNEL = swap_vfp.S -ZSWAPKERNEL = swap_vfp.S - -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S - -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c - -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S +ifeq ($(ARM_ABI),hard) SNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S @@ 
-88,47 +31,9 @@ CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S -SGEMMONCOPY = sgemm_ncopy_4_vfp.S -SGEMMOTCOPY = sgemm_tcopy_4_vfp.S -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMONCOPY = dgemm_ncopy_4_vfp.S -DGEMMOTCOPY = dgemm_tcopy_4_vfp.S -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - +endif From aa5edebc80200a590362dc2229e2751d399c04aa Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 13:12:05 +0530 Subject: [PATCH 17/42] arm: add softfp support in kernel/arm/asum_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/asum_vfp.S | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index a2dd4806d..1da6e4e1f 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -32,6 +32,11 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -71,11 +76,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index fe6242a5b..5b08e5028 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -475,6 +475,14 @@ asum_kernel_L999: vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif +#endif + bx lr EPILOGUE From 4f0773f07d07b9adad103e66d7b3abae108d9d31 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 20:06:29 +0530 Subject: [PATCH 18/42] arm: add softfp support in kernel/arm/axpy_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++--- kernel/arm/axpy_vfp.S | 71 ++++++++++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1da6e4e1f..63867d2b7 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -37,6 +37,11 @@ DASUMKERNEL = asum_vfp.S 
CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -76,11 +81,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S - SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 8e5334f62..a407b04bd 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_INC_X [fp, #0 ] -#define OLD_Y [fp, #4 ] -#define OLD_INC_Y [fp, #8 ] -#else +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 #define OLD_X [fp, #0 ] #define OLD_INC_X [fp, #4 ] #define OLD_Y [fp, #8 ] #define OLD_INC_Y [fp, #12 ] +#else +#define OLD_ALPHA [fp, #0] +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define OLD_Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] #endif - + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_ALPHAR [fp, #0] +#define OLD_ALPHAI [fp, #8] +#define OLD_X [fp, #16 ] +#define OLD_INC_X [fp, #20 ] +#define OLD_Y [fp, #24 ] +#define OLD_INC_Y [fp, #28 ] +#endif + +#endif //!defined(COMPLEX) + +#else //__ARM_PCS_VFP + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + +#endif //!defined(__ARM_PCS_VFP) + #define N r0 #define Y r1 #define INC_X r2 @@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #8 sub sp, sp, #STACKSIZE // reserve stack -#ifdef ARM_SOFTFP_ABI -#ifndef DOUBLE - vmov s0, r3 //move alpha to s0 +#if !defined(__ARM_PCS_VFP) +#if !defined(COMPLEX) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA + ldr X, OLD_X +#else + vldr d0, OLD_ALPHA ldr X, OLD_X #endif +#else //COMPLEX +#if !defined(DOUBLE) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr X, OLD_X +#else + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr X, OLD_X #endif - +#endif +#endif + ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y From 0150fabdb6250748bc45d18ccbb782331526c5cd Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 21:52:32 +0530 Subject: [PATCH 19/42] arm: add softfp support in kernel/arm/rot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/rot_vfp.S | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 63867d2b7..e9fe6bedd 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -42,6 +42,11 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -91,11 +96,6 @@ DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index d053423b6..6e679ecf9 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_INC_Y [fp, #0 ] +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) +#define OLD_C [fp, #4] +#define OLD_S [fp, #8] +#else +#define OLD_C [fp, #8] +#define OLD_S [fp, #16] +#endif +#endif #define N r0 #define X r1 @@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #8 ldr INC_Y , OLD_INC_Y - +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vldr s0, OLD_C + vldr s1, OLD_S +#else + vldr d0, OLD_C + vldr d1, OLD_S +#endif +#endif cmp N, #0 ble rot_kernel_L999 From 54915ce343dba7e00dba21c73b9fd35bcded0de3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 23:46:02 +0530 Subject: [PATCH 20/42] arm: add softfp support in kernel/arm/*dot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/cdot_vfp.S | 32 ++++++++++++++++++++++++++------ kernel/arm/ddot_vfp.S | 3 +++ kernel/arm/sdot_vfp.S | 13 ++++++------- kernel/arm/zdot_vfp.S | 32 ++++++++++++++++++++++++++------ 5 files changed, 66 insertions(+), 24 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e9fe6bedd..022547c9b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -47,6 +47,11 @@ DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S +SDOTKERNEL = sdot_vfp.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -86,11 +91,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S - SNRM2KERNEL = nrm2_vfp.S DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index 0497b6d83..e5a6e4d35 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmov s2, s0 vmov s3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y +#endif cmp N, #0 ble cdot_kernel_L999 @@ -265,7 +283,6 @@ cdot_kernel_S10: cdot_kernel_L999: - sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers @@ -276,8 +293,11 @@ cdot_kernel_L999: vadd.f32 s0 , s0, s2 vsub.f32 s1 , s1, s3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {s0 - s1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index f28acbae3..fb294d8b4 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -246,6 +246,9 @@ ddot_kernel_L999: vldm r3, { d8 - d15} // restore floating point registers vadd.f64 d0 , d0, d1 // set return value +#if !defined(__ARM_PCS_VFP) + vmov r0, r1, d0 +#endif sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index f3abdc197..5f4f424bf 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -329,20 +329,19 @@ sdot_kernel_L999: vldm r3, { s8 - s15} // restore floating point registers #if defined(DSDOT) - vadd.f64 d0 , d0, d1 // set return value - -#ifdef ARM_SOFTFP_ABI - vmov r0, r1, d0 +#else + vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if defined(DSDOT) + vmov r0, r1, d0 #else - - vadd.f32 s0 , s0, s1 // set return value -#ifdef ARM_SOFTFP_ABI vmov r0, s0 #endif #endif + sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 936ce9f60..43f2c0c0b 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vcvt.f64.f32 d2, s0 vcvt.f64.f32 d3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - +#endif cmp N, #0 ble zdot_kernel_L999 @@ -280,8 +297,11 @@ zdot_kernel_L999: vadd.f64 d0 , d0, d2 vsub.f64 d1 , d1, d3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {d0 - d1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr From e25f4c01d60b76ab97252e475abfb8fa7e65c0f9 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 19:57:28 +0530 Subject: [PATCH 21/42] arm: add softfp support in kernel/arm/nrm2_vfp*.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/KERNEL.ARMV7 | 10 +++++----- kernel/arm/nrm2_vfp.S | 7 +++++++ kernel/arm/nrm2_vfpv3.S | 9 +++++++-- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 022547c9b..be51e83b8 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -52,6 +52,11 @@ DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S +SNRM2KERNEL = nrm2_vfp.S +DNRM2KERNEL = nrm2_vfp.S +CNRM2KERNEL = nrm2_vfp.S +ZNRM2KERNEL = nrm2_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -91,11 +96,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SNRM2KERNEL = nrm2_vfp.S -DNRM2KERNEL = nrm2_vfp.S -CNRM2KERNEL = nrm2_vfp.S -ZNRM2KERNEL = nrm2_vfp.S - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d4829faa3..f4823b70a 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,5 +1,10 @@ include $(KERNELDIR)/KERNEL.ARMV6 +SNRM2KERNEL = nrm2_vfpv3.S +DNRM2KERNEL = nrm2_vfpv3.S +CNRM2KERNEL = nrm2_vfpv3.S +ZNRM2KERNEL = nrm2_vfpv3.S + STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c @@ -17,11 +22,6 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SNRM2KERNEL = nrm2_vfpv3.S -DNRM2KERNEL = nrm2_vfpv3.S -CNRM2KERNEL = nrm2_vfpv3.S -ZNRM2KERNEL = nrm2_vfpv3.S - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index b3bd28152..16ac5a632 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -573,6 +573,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 +#endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif #endif bx lr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 7af966895..84977901d 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -503,8 +503,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 -#ifdef ARM_SOFTFP_ABI - vmov r0, s0 +#endif + +#if !defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else + vmov r0, s0 #endif #endif From 83bd547517e0a7df5b60ae1e6165c9d3528a07e4 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 20:37:40 +0530 Subject: [PATCH 22/42] arm: add softfp support in kernel/arm/swap_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/swap_vfp.S | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index be51e83b8..86d3dabaa 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ 
-57,6 +57,11 @@ DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S
+SSWAPKERNEL = swap_vfp.S
+DSWAPKERNEL = swap_vfp.S
+CSWAPKERNEL = swap_vfp.S
+ZSWAPKERNEL = swap_vfp.S
+
SGEMMKERNEL = ../generic/gemmkernel_4x2.c
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
@@ -96,11 +101,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ifeq ($(ARM_ABI),hard)
-SSWAPKERNEL = swap_vfp.S
-DSWAPKERNEL = swap_vfp.S
-CSWAPKERNEL = swap_vfp.S
-ZSWAPKERNEL = swap_vfp.S
-
SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
CGEMVNKERNEL = cgemv_n_vfp.S
diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S
index 352875188..76661da79 100644
--- a/kernel/arm/swap_vfp.S
+++ b/kernel/arm/swap_vfp.S
@@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(COMPLEX)
+
+#if !defined(DOUBLE)
+#define OLD_X [fp, #0 ]
+#define OLD_INC_X [fp, #4 ]
+#define OLD_Y [fp, #8 ]
+#define OLD_INC_Y [fp, #12 ]
+#else
+#define OLD_X [fp, #8 ]
+#define OLD_INC_X [fp, #12]
+#define OLD_Y [fp, #16]
+#define OLD_INC_Y [fp, #20]
+#endif
+
+#else //COMPLEX
+
+#if !defined(DOUBLE)
+#define OLD_X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define OLD_Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#else
+#define OLD_X [fp, #16]
+#define OLD_INC_X [fp, #20]
+#define OLD_Y [fp, #24]
+#define OLD_INC_Y [fp, #28]
+#endif
+
+#endif // !defined(COMPLEX)
+
+#else
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
+#endif
#define N r0
@@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
push {r4 , fp}
add fp, sp, #8
+#if !defined(__ARM_PCS_VFP)
+	ldr X, OLD_X
+#endif
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y
From ebf9e9dabe0c4f8208e8dd5c8a6579fa0045450e Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Sat, 1 Jul 2017 11:16:12 -0700
Subject: [PATCH 23/42] arm64: Change mtune/mcpu options for THUNDERX2T99 target
---
Makefile.arm64 | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Makefile.arm64 b/Makefile.arm64
index 7e9df2f4b..d19e796a5 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
endif
ifeq ($(CORE), THUNDERX2T99)
-CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
-FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
+CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
+FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
endif
From 8f83d3f961f57fb002d8c5359c32a8db50dcab5d Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Sun, 2 Jul 2017 00:38:44 +0530
Subject: [PATCH 24/42] arm: add softfp support in vfp gemv kernels
---
kernel/arm/KERNEL.ARMV6 | 20 +++++++-------
kernel/arm/KERNEL.ARMV7 | 6 ++--
kernel/arm/cgemv_n_vfp.S | 28 +++++++++++++++----
kernel/arm/cgemv_t_vfp.S | 28 +++++++++++++++----
kernel/arm/gemv_n_vfp.S | 44 +++++++++++++++++++++++++----
kernel/arm/gemv_n_vfpv3.S | 58 +++++++++++++++++++++++----------------
kernel/arm/gemv_t_vfp.S | 54 ++++++++++++++++++++++--------------
kernel/arm/gemv_t_vfpv3.S | 44 +++++++++++++++++++++++++----
kernel/arm/zgemv_n_vfp.S | 28 +++++++++++++++----
kernel/arm/zgemv_t_vfp.S | 28 +++++++++++++++----
10 files changed, 252 insertions(+), 86 deletions(-)
diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6
index 86d3dabaa..022a93183 100644
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@@ -62,6 +62,16 @@ DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL =
swap_vfp.S +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -101,16 +111,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SGEMVNKERNEL = gemv_n_vfp.S -DGEMVNKERNEL = gemv_n_vfp.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S - STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index f4823b70a..0872cb8cd 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -5,6 +5,9 @@ DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S +SGEMVNKERNEL = gemv_n_vfpv3.S +DGEMVNKERNEL = gemv_n_vfpv3.S + STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c @@ -22,9 +25,6 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SGEMVNKERNEL = gemv_n_vfpv3.S -DGEMVNKERNEL = gemv_n_vfpv3.S - STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 5d2748644..4a1cd2d45 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble cgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr s0 , ALPHA_R diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 76c8a8f18..e1c750c85 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
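// Sketch of the assumption behind the softfp block that follows: with
// floating-point arguments passed in core registers, r3 now carries alpha_r
// rather than the matrix pointer, so OLD_A must be reloaded from its stack
// slot (OLD_A_SOFTFP) once alpha_r/alpha_i have been moved into s0/s1.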
cmp OLD_N, #0 ble cgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 385370b7f..7c154d741 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_M r0 @@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble gemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 93bf23e49..54f958b7b 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_M r0 #define AO1 r0 @@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble gemvn_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A str OLD_M, M - - + ldr INC_X , OLD_INC_X ldr INC_Y , OLD_INC_Y diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 816be54ff..9559d1829 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
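// The offsets defined below encode one assumed base-AAPCS detail: a float
// alpha still fits in r3, but a double alpha must occupy an 8-byte aligned
// stack slot, so it claims the first eight stack bytes and pushes A, LDA, X,
// INC_X, Y and INC_Y eight bytes further down than in the single case.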
#define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard abi -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_N r1 #define M r0 @@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble gemvt_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index 7ae5799bc..b1d3dadf1 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_N r1 @@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble gemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043..7d5567849 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
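// Assumed register rule behind the offsets below: in the base AAPCS a double
// must start in an even-numbered register, and with r0-r2 already holding
// integer arguments r3 cannot take half of alpha_r, so r3 goes unused and
// both components of the complex alpha open the stack block at [fp, #0] and
// [fp, #8].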
#define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble zgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr d0 , ALPHA_R diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa0701..407026166 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp OLD_N, #0
ble zgemvt_kernel_L999
+#if !defined(__ARM_PCS_VFP)
+	vldr d0, OLD_ALPHAR
+	vldr d1, OLD_ALPHAI
+	ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_A, A
str OLD_N, N
From eda9e8632ab7d94d609006612a4b760214dfa847 Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Sun, 2 Jul 2017 02:00:48 +0530
Subject: [PATCH 25/42] generic: Bug fixes in generic 4x2 and 4x4 gemm kernels
---
kernel/generic/gemmkernel_4x2.c | 30 +++++-----
kernel/generic/gemmkernel_4x4.c | 98 ++++++++++++++++-----------------
2 files changed, 64 insertions(+), 64 deletions(-)
diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c
index 1d15de1d7..8c784e2f1 100644
--- a/kernel/generic/gemmkernel_4x2.c
+++ b/kernel/generic/gemmkernel_4x2.c
@@ -154,11 +154,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res1_0 *= alpha;
	res1_1 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
-	C1[0] = res1_0;
-	C1[1] = res1_1;
+	C1[0] += res1_0;
+	C1[1] += res1_1;
	C0 = C0+2;
	C1 = C1+2;
@@ -190,12 +190,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res1_0 *= alpha;
-	C0[0] = res0_0;
+	C0[0] += res0_0;
-	C1[0] = res1_0;
+	C1[0] += res1_0;
	C0 = C0+1;
	C1 = C1+1;
}
@@ -245,10 +245,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res0_2 *= alpha;
	res0_3 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
-	C0[2] = res0_2;
-	C0[3] = res0_3;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
+	C0[2] += res0_2;
+	C0[3] += res0_3;
	C0 = C0+4;
@@ -278,8 +278,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res0_0 *= alpha;
	res0_1 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
	C0 = C0+2;
@@ -306,7 +306,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
-	C0[0] = res0_0;
+	C0[0] += res0_0;
	C0 = C0+1;
}
k = (bk<<0);
diff --git a/kernel/generic/gemmkernel_4x4.c b/kernel/generic/gemmkernel_4x4.c
index bd67b3fc8..99bd9c1ef 100644
--- a/kernel/generic/gemmkernel_4x4.c
+++ b/kernel/generic/gemmkernel_4x4.c
@@ -152,25 +152,25 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res3_2 *= alpha;
	res3_3 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
-	C0[2] = res0_2;
-	C0[3] = res0_3;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
+	C0[2] += res0_2;
+	C0[3] += res0_3;
-	C1[0] = res1_0;
-	C1[1] = res1_1;
-	C1[2] = res1_2;
-	C1[3] = res1_3;
+	C1[0] += res1_0;
+	C1[1] += res1_1;
+	C1[2] += res1_2;
+	C1[3] += res1_3;
-	C2[0] = res2_0;
-	C2[1] = res2_1;
-	C2[2] = res2_2;
-	C2[3] = res2_3;
+	C2[0] += res2_0;
+	C2[1] += res2_1;
+	C2[2] += res2_2;
+	C2[3] += res2_3;
-	C3[0] = res3_0;
-	C3[1] = res3_1;
-	C3[2] = res3_2;
-	C3[3] = res3_3;
+	C3[0] += res3_0;
+	C3[1] += res3_1;
+	C3[2] += res3_2;
+	C3[3] += res3_3;
	C0 = C0+4;
	C1 = C1+4;
@@ -230,17 +230,17 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res3_0 *= alpha;
	res3_1 *= alpha;
-	C0[0] = res0_0;
-	C0[1] = res0_1;
+	C0[0] += res0_0;
+	C0[1] += res0_1;
-	C1[0] = res1_0;
-	C1[1] = res1_1;
+	C1[0] += res1_0;
+	C1[1] += res1_1;
-	C2[0] = res2_0;
-	C2[1] = res2_1;
+	C2[0] += res2_0;
+	C2[1] += res2_1;
-	C3[0] = res3_0;
-	C3[1] = res3_1;
+	C3[0] += res3_0;
+	C3[1] += res3_1;
	C0 = C0+2;
	C1 = C1+2;
@@ -283,13 +283,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
	res3_0 *= alpha;
-	C0[0] = res0_0;
+	C0[0] += res0_0;
-	C1[0] = res1_0;
+	C1[0]
+= res1_0; - C2[0] = res2_0; + C2[0] += res2_0; - C3[0] = res3_0; + C3[0] += res3_0; C0 = C0+1; C1 = C1+1; @@ -360,15 +360,15 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_2 *= alpha; res1_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; - C1[0] = res1_0; - C1[1] = res1_1; - C1[2] = res1_2; - C1[3] = res1_3; + C1[0] += res1_0; + C1[1] += res1_1; + C1[2] += res1_2; + C1[3] += res1_3; C0 = C0+4; C1 = C1+4; @@ -408,11 +408,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; res1_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; - C1[0] = res1_0; - C1[1] = res1_1; + C1[0] += res1_0; + C1[1] += res1_1; C0 = C0+2; C1 = C1+2; @@ -444,9 +444,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; - C1[0] = res1_0; + C1[0] += res1_0; C0 = C0+1; C1 = C1+1; @@ -499,10 +499,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_2 *= alpha; res0_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; C0 = C0+4; @@ -532,8 +532,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; res0_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; C0 = C0+2; @@ -558,7 +558,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; C0 = C0+1; From 872a11a2bfd90225d5ace725b0ec4f59bd9291f3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:05:48 +0530 Subject: [PATCH 26/42] arm: add softfp support in sgemm/strmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 4 ++-- kernel/arm/KERNEL.ARMV7 | 3 +-- kernel/arm/sgemm_kernel_4x2_vfp.S | 12 ++++++++++++ kernel/arm/sgemm_kernel_4x4_vfpv3.S | 29 +++++++++++------------------ kernel/arm/strmm_kernel_4x2_vfp.S | 13 +++++++++++++ kernel/arm/strmm_kernel_4x4_vfpv3.S | 13 +++++++++++++ 6 files changed, 52 insertions(+), 22 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 022a93183..18d9869de 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -73,6 +73,7 @@ CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S SGEMMKERNEL = ../generic/gemmkernel_4x2.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S SGEMMITCOPY = sgemm_tcopy_4_vfp.S @@ -97,6 +98,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o STRMMKERNEL = ../generic/trmmkernel_4x2.c +STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = ../generic/trmmkernel_4x2.c CGEMMONCOPY = cgemm_ncopy_2_vfp.S @@ -111,12 +113,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -SGEMMKERNEL = sgemm_kernel_4x2_vfp.S DGEMMKERNEL = dgemm_kernel_4x2_vfp.S CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 0872cb8cd..e2044133d 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -11,7 +11,7 @@ DGEMVNKERNEL = 
gemv_n_vfpv3.S STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c -SGEMMKERNEL = ../generic/gemmkernel_4x4.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o @@ -30,7 +30,6 @@ DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index e8b44b742..1f21e5a1f 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 86198ac90..6491d3571 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 - -#ifdef ARM_SOFTFP_ABI -#define OLD_ALPHA r3 -//#define OLD_A -#else //hard #define OLD_A r3 #define OLD_ALPHA s0 -#endif /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] - -#ifndef ARM_SOFTFP_ABI #define A [fp, #-268 ] -#endif #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, #-240] @@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] -#ifdef ARM_SOFTFP_ABI -#define A [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] #define B [fp, #8 ] #define C [fp, #12 ] #define OLD_LDC [fp, #16 ] -#else //hard +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif - + #define I r0 #define J r1 #define L r2 @@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_M, M str OLD_N, N str OLD_K, K - -#ifdef ARM_SOFTFP_ABI - str OLD_ALPHA, ALPHA -#else //hard str OLD_A, A vstr OLD_ALPHA, ALPHA -#endif + sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 8f97644ec..635b1dd13 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
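// Layout assumed by the softfp defines below: the float alpha arrives in r3
// (copied to s0 with vmov in the prologue), so A, B, C, LDC and the TRMM
// OFFSET each sit one 4-byte slot later on the stack than in the hardfp
// variant.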
#define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define OLD_C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index 0dd03ac85..e24d24eba 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 09bc6ebe5b26aecd405a25dad2fa2934642fc827 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:24:38 +0530 Subject: [PATCH 27/42] arm: add softfp support in dgemm/dtrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 8 ++------ kernel/arm/KERNEL.ARMV7 | 10 +++------- kernel/arm/dgemm_kernel_4x2_vfp.S | 13 ++++++++++++- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 12 ++++++++++++ kernel/arm/dtrmm_kernel_4x2_vfp.S | 13 +++++++++++++ kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 13 +++++++++++++ 6 files changed, 55 insertions(+), 14 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 18d9869de..622085b45 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -72,7 +72,6 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -SGEMMKERNEL = ../generic/gemmkernel_4x2.c SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -85,7 +84,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_4x2.c +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = dgemm_ncopy_4_vfp.S DGEMMITCOPY = dgemm_tcopy_4_vfp.S @@ -97,9 +96,8 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRMMKERNEL = ../generic/trmmkernel_4x2.c STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = ../generic/trmmkernel_4x2.c +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S @@ -113,11 +111,9 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -DGEMMKERNEL = dgemm_kernel_4x2_vfp.S CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 
b/kernel/arm/KERNEL.ARMV7 index e2044133d..63c468e66 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -8,8 +8,8 @@ ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_4x4.c +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -17,7 +17,7 @@ SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_4x4.c +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMONCOPY = dgemm_ncopy_4_vfp.S DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o @@ -25,13 +25,9 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S - CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 183269d1b..001a6050c 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] - +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index b14052e06..1744b54d8 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index c578d2b1e..3d6fbf8e9 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
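// Worked offsets for the double-precision defines below, assuming the usual
// prologue of these kernels: push {r4 - r9, fp} stores seven words and
// "add fp, sp, #24" leaves fp one word below the caller's sp, so the first
// stack argument sits at [fp, #4].  The 8-byte alpha fills [fp, #4]..[fp, #11],
// which puts A at [fp, #12] with B, C, LDC and OFFSET following in 4-byte steps.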
#define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c7e455f16..c0c6a1677 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 305cd2e8b41f4daccdfa1e6631bce7f7133faf92 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:42:32 +0530 Subject: [PATCH 28/42] arm: add softfp support in cgemm/ctrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 4 ++-- kernel/arm/KERNEL.ARMV7 | 5 +++-- kernel/arm/cgemm_kernel_2x2_vfp.S | 14 ++++++++++++++ kernel/arm/cgemm_kernel_2x2_vfpv3.S | 14 ++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfp.S | 15 +++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 15 +++++++++++++++ 6 files changed, 63 insertions(+), 4 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 622085b45..e8fc3df73 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -98,7 +98,9 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o @@ -111,10 +113,8 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 63c468e66..4bfe18d1d 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -10,6 +10,7 @@ DGEMVNKERNEL = gemv_n_vfpv3.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -23,12 +24,12 @@ DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S + ifeq ($(ARM_ABI),hard) -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = 
zgemm_kernel_2x2_vfpv3.S endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index f0517cb47..512eea387 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index cf132a184..42eb53a55 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index 8cb7ede9d..95578b10a 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 97bd88c69..18beb4e47 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
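// Assumed split of the single-complex alpha in the defines below: alpha_r is
// the last argument that still fits in a core register (r3, fetched with
// vmov), while alpha_i spills to the first stack slot at [fp, #4] (fetched
// with vldr), leaving the pointer arguments to start at [fp, #8].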
#define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 97d671eb610de8cd73fa90923bfbed87d1d8ffef Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:54:32 +0530 Subject: [PATCH 29/42] arm: add softfp support in zgemm/ztrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 13 ++++--------- kernel/arm/KERNEL.ARMV7 | 15 +++++---------- kernel/arm/zgemm_kernel_2x2_vfp.S | 14 ++++++++++++++ kernel/arm/zgemm_kernel_2x2_vfpv3.S | 14 ++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfp.S | 15 +++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 15 +++++++++++++++ 6 files changed, 67 insertions(+), 19 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e8fc3df73..960dae67b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -96,25 +96,20 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S - CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S ZGEMMONCOPY = zgemm_ncopy_2_vfp.S ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ifeq ($(ARM_ABI),hard) - +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S - -endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4bfe18d1d..5e0b4cfb8 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -8,10 +8,6 @@ ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S - SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S @@ -25,11 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S - -ifeq ($(ARM_ABI),hard) - -ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S - ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -endif +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S + diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d2..618f09781 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792f..0fe0c1993 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b8..78d09a9c7 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc..bf72ce605 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+	vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+	vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+	ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
From 37efb5bc1d9b78e5e612b5aad896981d58a5d18f Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Sun, 2 Jul 2017 03:06:36 +0530
Subject: [PATCH 30/42] arm: Remove unnecessary files/code

Since softfp code has been added to all required vfp kernels, the code for
auto-detection of the ABI is no longer required. The option to force the
softfp ABI on the make command line by giving ARM_SOFTFP_ABI=1 is retained,
but there is no need to give this option anymore.

Also, the newly added C versions of the 4x4/4x2 gemm/trmm kernels are
removed. These are no longer required; moreover, these kernels have bugs.
---
Makefile.system | 19 +-
c_check | 16 +-
kernel/generic/gemmkernel_4x2.c | 317 ------------------
kernel/generic/gemmkernel_4x4.c | 571 --------------------------------
kernel/generic/trmmkernel_4x2.c | 528 -----------------------------
5 files changed, 8 insertions(+), 1443 deletions(-)
delete mode 100644 kernel/generic/gemmkernel_4x2.c
delete mode 100644 kernel/generic/gemmkernel_4x4.c
delete mode 100644 kernel/generic/trmmkernel_4x2.c
diff --git a/Makefile.system b/Makefile.system
index 2cae5f1c9..4face0e51 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -487,19 +487,14 @@ ifeq ($(ARCH), arm)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
-# If ABI is specified on command line use it. Else use the automatically detected ABI.
-ifeq ($(ARM_SOFTFP_ABI),1)
-ARM_ABI = softfp
-else
-ifeq ($(ARM_HARD_ABI),1)
-ARM_ABI = hard
-else
-ARM_ABI=$(ARM_ABI_AUTO)
+CCOMMON_OPT += -marm
+FCOMMON_OPT += -marm
+
+# If softfp abi is mentioned on the command line, force it.
+ifeq ($(ARM_SOFTFP_ABI), 1)
+CCOMMON_OPT += -mfloat-abi=softfp
+FCOMMON_OPT += -mfloat-abi=softfp
endif
-endif
-export ARM_ABI_AUTO
-CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI)
-FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI)
endif
diff --git a/c_check b/c_check
index 2e7e08cfb..20da288be 100644
--- a/c_check
+++ b/c_check
@@ -94,17 +94,7 @@ if ($architecture eq "mips64") {
$defined = 1;
}
-if ($architecture eq "arm") {
-	$defined = 1;
-	$data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`;
-	if ($data ne "") {
-		$abi = "hard";
-	} else {
-		$abi = "softfp";
-	}
-}
-
-if ($architecture eq "arm64") {
+if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
@@ -297,10 +287,6 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
-if ($architecture eq "arm") {
-	print MAKEFILE "ARM_ABI_AUTO=$abi\n";
-}
-
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
$compiler =~ tr/[a-z]/[A-Z]/;
diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c
deleted file mode 100644
index 8c784e2f1..000000000
--- a/kernel/generic/gemmkernel_4x2.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/***************************************************************************
-Copyright (c) 2017, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2.
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - for (j=0; j<(bn/2); j+=2) - { - C0 = C; - C1 = C0+ldc; - - ptrba = ba; - - for (i=0; i - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT res2_0; - FLOAT res2_1; - FLOAT res2_2; - FLOAT res2_3; - - FLOAT res3_0; - FLOAT res3_1; - FLOAT res3_2; - FLOAT res3_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - FLOAT b2; - FLOAT b3; - - - for (j=0; j - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - for (j=0; j<(bn/2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - - ptrba = ba; - - for (i=0; i Date: Sun, 2 Jul 2017 00:50:14 +0200 Subject: [PATCH 31/42] Add files via upload --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 499732d16..107dff5f2 100644 --- a/Makefile +++ b/Makefile @@ -25,8 +25,8 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench -.PHONY : all libs netlib test ctest shared install -.NOTPARALLEL : all libs 
prof lapack-test install blas-test
+.PHONY : all libs netlib $(RELA) test ctest shared install
+.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test

 all :: libs netlib $(RELA) tests shared
 	@echo
@@ -339,6 +339,7 @@ endif
 	@touch $(NETLIB_LAPACK_DIR)/make.inc
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
 	@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
+	@$(MAKE) -C relapack clean
 	@rm -f *.grd Makefile.conf_last config_last.h
 	@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
 	@echo Done.

From df2dfe65d6ba112e70ec6c1dce167298d2d5779b Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 2 Jul 2017 01:46:23 +0200
Subject: [PATCH 32/42] Update relapack/Makefile (build the ReLAPACK library
 with verbose ar)

---
 relapack/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/relapack/Makefile b/relapack/Makefile
index 1e81b5423..90d37f101 100644
--- a/relapack/Makefile
+++ b/relapack/Makefile
@@ -29,7 +29,7 @@ LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm

 libs: $(OBJS)
 	@echo "Building ReLAPACK library $(LIBNAME)"
-	$(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS)
+	$(AR) -rv $(TOPDIR)/$(LIBNAME) $(OBJS)
 	$(RANLIB) $(TOPDIR)/$(LIBNAME)

 %.o: %.c config.h

From fa6a920caa09e60e24acfad7ad8acdf30e8a7f14 Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Wed, 5 Jul 2017 17:01:03 +0800
Subject: [PATCH 33/42] Link -lm or -lm_hard for Android ARMv7.

---
 Makefile.system | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 29d3efd53..bb55dd693 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -493,6 +493,14 @@ else
 CCOMMON_OPT += -mfloat-abi=hard
 FCOMMON_OPT += -mfloat-abi=hard
 endif
+
+ifeq ($(OSNAME), Android)
+ifeq ($(ARM_SOFTFP_ABI), 1)
+EXTRALIB += -lm
+else
+EXTRALIB += -Wl,-lm_hard
+endif
+endif
 endif

 ifeq ($(ARCH), arm64)

From 3381f2370966d61c333476136fa188567e958407 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 6 Jul 2017 10:12:00 +0200
Subject: [PATCH 34/42] Handle different object extensions in Makefile

The optimized LAPACK functions from the interface directory use the
OS-dependent suffixes .o/.obj for their object files, while netlib
LAPACK uses .o throughout. ReLAPACK object names have to match for
the function replacement in the growing library file to work.
--- relapack/Makefile | 56 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/relapack/Makefile b/relapack/Makefile index 90d37f101..ddf101bd1 100644 --- a/relapack/Makefile +++ b/relapack/Makefile @@ -4,7 +4,38 @@ include $(TOPDIR)/Makefile.system SRC = $(wildcard src/*.c) -OBJS = $(SRC:%.c=%.o) + +SRC1 = \ + src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ + src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ + src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ + src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c + +SRC2 = \ + src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ + src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ + src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ + src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ + src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ + src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ + src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ + src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ + src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c + +SRCX = \ + src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ + src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ + src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ + src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ + src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ + src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ + src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ + src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ + src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c + +OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) +OBJS2 = $(SRC2:%.c=%.o) +OBJS = $(OBJS1) $(OBJS2) TEST_SUITS = \ slauum dlauum clauum zlauum \ @@ -29,9 +60,12 @@ LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm libs: $(OBJS) @echo "Building ReLAPACK library $(LIBNAME)" - $(AR) -rv $(TOPDIR)/$(LIBNAME) $(OBJS) + $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) $(RANLIB) $(TOPDIR)/$(LIBNAME) +%.$(SUFFIX): %.c config.h + $(CC) $(CFLAGS) -c $< -o $@ + %.o: %.c config.h $(CC) $(CFLAGS) -c $< -o $@ @@ -45,20 +79,20 @@ test/%.pass: test/%.x @echo -n $*: @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) -test/s%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h - $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) +test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) 
$(LINK_TEST)

-test/d%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
-	$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+	$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

-test/c%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
-	$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+	$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

-test/z%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
-	$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+	$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

 # cleaning up
 clean:
-	rm -f $(OBJS) test/util.o test/*.x
+	rm -f $(OBJS) test/util.$(SUFFIX) test/*.x

From 49e62c0e7796ccaa773591e739628846e3d8ab06 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 6 Jul 2017 17:30:12 +0200
Subject: [PATCH 35/42] Fix stride calculation in syrk_thread.c (taken from
 wernsaar)

Stride calculation fix copied from
https://github.com/wernsaar/OpenBLAS/commit/88900e1
(A short rounding sketch follows the PATCH 36 message below.)

---
 driver/level3/syrk_thread.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c
index 94274be72..5f40853dc 100644
--- a/driver/level3/syrk_thread.c
+++ b/driver/level3/syrk_thread.c
@@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
     if (nthreads - num_cpu > 1) {
       di = (double)i;
-      width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
+      width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);

       if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
     if (nthreads - num_cpu > 1) {
       di = (double)(arg -> n - i);
-      width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
+      width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);

       if ((width <= 0) || (width > n_to - i)) width = n_to - i;

From f02d535fdebd541b0fc21a58c8b370c744555531 Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Fri, 7 Jul 2017 12:30:42 +0530
Subject: [PATCH 36/42] arm: Fix clang compilation for ARMv7

clang does not recognize some pre-UAL VFP mnemonics such as fnmacs,
fnmacd, fnmuls and fnmuld. Replace them with the equivalent UAL
mnemonics, which are vmls.f32, vmls.f64, vnmul.f32 and vnmul.f64
respectively.
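[Editor's note on the stride fix in PATCH 35 above: the old expression rounded the computed width up to a multiple of mask+1 with a bitmask, which is only valid when mask+1 is a power of two, and it also truncated the square root to an integer before adding mask. The divide-and-multiply form works for any alignment. The following standalone C sketch is illustrative only; round_mask and round_div are hypothetical helpers, not OpenBLAS code.]

#include <stdio.h>

typedef long BLASLONG;   /* stand-in for the OpenBLAS typedef */

/* Old scheme: round w up to a multiple of (mask+1) with a bitmask.
 * Only correct when mask+1 is a power of two. */
static BLASLONG round_mask(double w, BLASLONG mask) {
  return ((BLASLONG)w + mask) & ~mask;
}

/* New scheme: divide and multiply, valid for any mask. */
static BLASLONG round_div(double w, BLASLONG mask) {
  return (BLASLONG)((w + mask) / (mask + 1)) * (mask + 1);
}

int main(void) {
  /* mask = 3 (power-of-two alignment 4): both schemes agree and print 8. */
  printf("w=5, mask=3: old=%ld new=%ld\n", round_mask(5.0, 3), round_div(5.0, 3));
  /* mask = 2 (alignment 3): the bitmask version yields 4, which is not a
   * multiple of 3; the divide form yields the intended 6. */
  printf("w=4, mask=2: old=%ld new=%ld\n", round_mask(4.0, 2), round_div(4.0, 2));
  return 0;
}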
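[Editor's note on PATCH 36: to make the mnemonic substitution concrete, here is a minimal inline-assembly sketch. It assumes a 32-bit ARM target with VFP and GCC/clang's "t" (single-precision VFP register) constraint, and it illustrates the instruction equivalence rather than reproducing code from the patch: the pre-UAL fnmacs Sd, Sn, Sm and the UAL vmls.f32 Sd, Sn, Sm both compute Sd = Sd - Sn*Sm, while fnmuls and vnmul.f32 compute -(Sn*Sm).]

#include <stdio.h>

/* UAL spelling of fnmacs: acc = acc - a*b */
static inline float mls_f32(float acc, float a, float b) {
  __asm__("vmls.f32 %0, %1, %2" : "+t"(acc) : "t"(a), "t"(b));
  return acc;
}

/* UAL spelling of fnmuls: result = -(a*b) */
static inline float nmul_f32(float a, float b) {
  float r;
  __asm__("vnmul.f32 %0, %1, %2" : "=t"(r) : "t"(a), "t"(b));
  return r;
}

int main(void) {
  /* Expected output: 4.000000 -6.000000 (10 - 2*3 and -(2*3)).
   * Build with an ARMv7 VFP toolchain, e.g. (assumed triplet)
   * arm-linux-gnueabihf-gcc -mfpu=vfpv3 vfp_ual.c */
  printf("%f %f\n", mls_f32(10.0f, 2.0f, 3.0f), nmul_f32(2.0f, 3.0f));
  return 0;
}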
--- kernel/arm/axpy_vfp.S | 8 ++-- kernel/arm/cgemm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/cgemv_n_vfp.S | 16 +++---- kernel/arm/cgemv_t_vfp.S | 16 +++---- kernel/arm/ctrmm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/rot_vfp.S | 72 ++++++++++++++--------------- kernel/arm/scal_vfp.S | 24 +++++----- kernel/arm/zgemm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 16 +++---- kernel/arm/zgemv_n_vfp.S | 16 +++---- kernel/arm/zgemv_t_vfp.S | 16 +++---- kernel/arm/ztrmm_kernel_2x2_vfp.S | 16 +++---- kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 16 +++---- 15 files changed, 148 insertions(+), 148 deletions(-) diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 8e5334f62..4040c7da2 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -71,14 +71,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define FMAC_R1 fmacd -#define FMAC_R2 fnmacd +#define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs -#define FMAC_R2 fnmacs +#define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs @@ -90,14 +90,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacd #define FMAC_R2 fmacd -#define FMAC_I1 fnmacd +#define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs #define FMAC_R2 fmacs -#define FMAC_I1 fnmacs +#define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index f0517cb47..639b713cd 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -94,42 +94,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index cf132a184..16c00ad73 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -106,10 +106,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmacs - #define FMAC_R2 fnmacs + #define FMAC_R1 vmls.f32 + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs - #define FMAC_I2 fnmacs + #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs @@ -136,10 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmacs + #define FMAC_R1 vmls.f32 #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs - #define FMAC_I2 fnmacs + #define FMAC_I1 vmls.f32 + #define FMAC_I2 vmls.f32 #endif diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 5d2748644..a9040e76e 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -78,42 +78,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 76c8a8f18..56451c5df 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -76,42 +76,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index 8cb7ede9d..50798449b 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -98,42 +98,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 97bd88c69..ef7e58fa4 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -93,10 +93,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmuls - #define FMAC_R2 fnmacs + #define FMAC_R1 vnmul.f32 + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls - #define FMAC_I2 fnmacs + #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuls #define FMAC_R2 fmacs - #define FMAC_I1 fnmuls + #define FMAC_I1 vnmul.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubs #define FMAC_R1 fmuls - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls #define FMAC_I2 fmacs @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmuls + #define FMAC_R1 vnmul.f32 #define FMAC_R2 fmacs - #define FMAC_I1 fnmuls - #define FMAC_I2 fnmacs + #define FMAC_I1 vnmul.f32 + #define FMAC_I2 vmls.f32 #endif diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index d053423b6..0d1067cf9 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -100,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X, { d2 } fstmiad Y, { d3 } @@ -145,7 +145,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -154,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -163,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -186,7 +186,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X, { s2 } fstmias Y, { s3 } @@ -226,13 +226,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -241,13 +241,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -259,13 +259,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -274,13 +274,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -294,13 +294,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -314,13 +314,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 vstr d2 , [ X, #0 ] vstr d3 , [ Y, #0 ] vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 vstr d2 , [ X, #8 ] vstr d3 , [ Y, #8 ] @@ -343,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -358,13 +358,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -376,13 +376,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -391,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -411,13 +411,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -431,13 +431,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 vstr s2 , [ X, #0 ] vstr s3 , [ Y, #0 ] vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 vstr s2 , [ X, #4 ] vstr s3 , [ Y, #4 ] diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S index a8939c3a2..cc3e3b98d 100644 --- a/kernel/arm/scal_vfp.S +++ b/kernel/arm/scal_vfp.S @@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X, { d2 - d3 } @@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X, { s2 - s3 } diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d2..6aeb6c790 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -87,42 +87,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792f..10c83e356 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -106,10 +106,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd - #define FMAC_R2 fnmacd + #define FMAC_R1 vmls.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd @@ -136,10 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd + #define FMAC_R1 vmls.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd - #define FMAC_I2 fnmacd + #define FMAC_I1 vmls.f64 + #define FMAC_I2 vmls.f64 #endif diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043..cba59567d 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -79,42 +79,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa0701..b11c08086 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -77,42 +77,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b8..f412dfcfa 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -96,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc..92370bbc1 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -93,10 +93,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld - #define FMAC_R2 fnmacd + #define FMAC_R1 vnmul.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuld #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld + #define FMAC_I1 vnmul.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmuld - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld #define FMAC_I2 fmacd @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld + #define FMAC_R1 vnmul.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld - #define FMAC_I2 fnmacd + #define FMAC_I1 vnmul.f64 + #define FMAC_I2 vmls.f64 #endif From c1cf62d2c030b65d8fbf37b19ca1d88fa38f7709 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jul 2017 09:45:38 +0200 Subject: [PATCH 37/42] Add sched_getcpu implementation for pre-2.6 glibc Fixes #1210, compilation on RHEL5 with affinity enabled --- driver/others/init.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 9be6f52b0..3e6176967 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -354,6 +354,24 @@ static int numa_check(void) { return common -> num_nodes; } +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 6) +int sched_getcpu(void) +{ +int cpu; +FILE *fp = NULL; +if ( (fp = fopen("/proc/self/stat", "r")) == NULL) + return -1; +if ( fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu) != 1) { + fclose (fp); + return -1; + } + fclose (fp); + return(cpu); +} +#endif +#endif + static void numa_mapping(void) { int node, cpu, core; @@ -808,7 +826,6 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { - #ifdef DEBUG fprintf(stderr, "Shared Memory Initialization.\n"); #endif @@ -830,7 +847,7 @@ void gotoblas_affinity_init(void) { if (common -> num_nodes > 1) numa_mapping(); common -> final_num_procs = 0; - for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. + for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. 
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;

From ad2462811a4093153ea9898200ab73ef4aea6f23 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 9 Jul 2017 13:15:24 +0200
Subject: [PATCH 38/42] Do not add -lpthread on Android builds (#1229)

* Do not add -lpthread on Android builds
* Do not add -lpthread on Android cmake builds

---
 cmake/os.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/os.cmake b/cmake/os.cmake
index f5a75027c..e9df68d7f 100644
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@@ -77,7 +77,7 @@ if (CYGWIN)
   set(NO_EXPRECISION 1)
 endif ()

-if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
+if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
   if (SMP)
     set(EXTRALIB "${EXTRALIB} -lpthread")
   endif ()

From 4a012c3d208f7e2a1df9303a50c884970217a259 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 11 Jul 2017 15:39:15 +0200
Subject: [PATCH 39/42] Fix unintentional fall-through cases in get_cacheinfo

These appear to be unintended side effects of PR #1091, probably
causing #1232

---
 cpuid_x86.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index ab2ecdcaf..3733ea3ac 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -637,12 +637,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
     LD1.linesize = 64;
     break;
   case 0x63 :
-    DTB.size        = 2048;
-    DTB.associative = 4;
-    DTB.linesize    = 32;
-    LDTB.size       = 4096;
-    LDTB.associative= 4;
-    LDTB.linesize   = 32;
+    DTB.size        = 2048;
+    DTB.associative = 4;
+    DTB.linesize    = 32;
+    LDTB.size       = 4096;
+    LDTB.associative= 4;
+    LDTB.linesize   = 32;
+    break;
   case 0x66 :
     LD1.size        = 8;
     LD1.associative = 4;
@@ -675,12 +676,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
     LC1.associative = 8;
     break;
   case 0x76 :
-    ITB.size        = 2048;
-    ITB.associative = 0;
-    ITB.linesize    = 8;
-    LITB.size       = 4096;
-    LITB.associative= 0;
-    LITB.linesize   = 8;
+    ITB.size        = 2048;
+    ITB.associative = 0;
+    ITB.linesize    = 8;
+    LITB.size       = 4096;
+    LITB.associative= 0;
+    LITB.linesize   = 8;
+    break;
   case 0x77 :
     LC1.size        = 16;
     LC1.associative = 4;

From 529bfc36ec444223ba7b49717ef2d7fa12445159 Mon Sep 17 00:00:00 2001
From: Andrew
Date: Wed, 12 Jul 2017 00:59:30 +0200
Subject: [PATCH 40/42] Fix write past fixed size buffer

---
 driver/level2/gbmv_thread.c | 2 +-
 driver/level2/sbmv_thread.c | 2 +-
 driver/level2/spmv_thread.c | 2 +-
 driver/level2/tbmv_thread.c | 2 +-
 driver/level2/tpmv_thread.c | 2 +-
 driver/level2/trmv_thread.c | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c
index ef9d58d76..e86b565f8 100644
--- a/driver/level2/gbmv_thread.c
+++ b/driver/level2/gbmv_thread.c
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
-  BLASLONG range_m[MAX_CPU_NUMBER];
+  BLASLONG range_m[MAX_CPU_NUMBER + 1];
   BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c
index a0377d638..5718c0ec9 100644
--- a/driver/level2/sbmv_thread.c
+++ b/driver/level2/sbmv_thread.c
@@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
 #endif

   blas_arg_t args;
-  blas_queue_t queue[MAX_CPU_NUMBER];
+  blas_queue_t queue[MAX_CPU_NUMBER + 1];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
   BLASLONG range_n[MAX_CPU_NUMBER];

diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c
index f8ae3cdcd..035300841 100644
--- a/driver/level2/spmv_thread.c
+++ b/driver/level2/spmv_thread.c
@@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c
index bbb1c50eb..226a922e9 100644
--- a/driver/level2/tbmv_thread.c
+++ b/driver/level2/tbmv_thread.c
@@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c
index 47dc1daf9..c91b52775 100644
--- a/driver/level2/tpmv_thread.c
+++ b/driver/level2/tpmv_thread.c
@@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
index 42edb83cb..0a155366c 100644
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;

From d33fc32cf30cf1262030c93fa44c72ca8ab27681 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 12 Jul 2017 09:35:11 +0200
Subject: [PATCH 41/42] Revert "Fix unintentional fall-through cases in
 get_cacheinfo"

---
 cpuid_x86.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 3733ea3ac..ab2ecdcaf 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -637,13 +637,12 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
     LD1.linesize = 64;
     break;
   case 0x63 :
-    DTB.size        = 2048;
-    DTB.associative = 4;
-    DTB.linesize    = 32;
-    LDTB.size       = 4096;
-    LDTB.associative= 4;
-    LDTB.linesize   = 32;
-    break;
+    DTB.size        = 2048;
+    DTB.associative = 4;
+    DTB.linesize    = 32;
+    LDTB.size       = 4096;
+    LDTB.associative= 4;
+    LDTB.linesize   = 32;
   case 0x66 :
     LD1.size        = 8;
     LD1.associative = 4;
@@ -676,13 +675,12 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
     LC1.associative = 8;
     break;
   case 0x76 :
-    ITB.size        = 2048;
-    ITB.associative = 0;
-    ITB.linesize    = 8;
-    LITB.size       = 4096;
-    LITB.associative= 0;
-    LITB.linesize   = 8;
-    break;
+    ITB.size        = 2048;
+    ITB.associative = 0;
+    ITB.linesize    = 8;
+    LITB.size       = 4096;
+    LITB.associative= 0;
+    LITB.linesize   = 8;
   case 0x77 :
     LC1.size        = 16;
     LC1.associative = 4;

From 31e086d6a66658e7b04390e69884fe569e0a1e9d Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 13 Jul 2017 22:01:47 +0200
Subject: [PATCH 42/42] Disable ReLAPACK by default (#1238)

* Disable ReLAPACK by default; mention it in the final build message if included
* Add files via upload
* Add files via upload
* Add files via upload
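[Editor's note on the buffer fix in PATCH 40 above: partitioning n rows among p threads stores both endpoints of every range, so p+1 boundary values are needed; a p-element array is overrun on the last write. A self-contained C sketch of the fencepost problem, hypothetical code rather than OpenBLAS source:]

#include <stdio.h>

#define MAX_CPU_NUMBER 4

int main(void) {
  long range[MAX_CPU_NUMBER + 1]; /* p partitions need p+1 fenceposts */
  long n = 1000;
  int p = MAX_CPU_NUMBER;

  range[0] = 0;
  for (int i = 0; i < p; i++)
    range[i + 1] = range[i] + n / p; /* last pass writes index p, which
                                        overflows a p-element array */

  for (int i = 0; i < p; i++)
    printf("thread %d: rows [%ld, %ld)\n", i, range[i], range[i + 1]);
  return 0;
}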
---
 Makefile        | 2 +-
 Makefile.rule   | 2 +-
 Makefile.system | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 107dff5f2..1b9bcb118 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@ SUBDIRS += lapack
 endif

 RELA =
-ifneq ($(BUILD_RELAPACK), 0)
+ifeq ($(BUILD_RELAPACK), 1)
 RELA = re_lapack
 endif

diff --git a/Makefile.rule b/Makefile.rule
index 2866699be..8d8aecdc9 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -84,7 +84,7 @@ VERSION = 0.2.20.dev
 BUILD_LAPACK_DEPRECATED = 1

 # Build RecursiveLAPACK on top of LAPACK
-BUILD_RELAPACK = 1
+# BUILD_RELAPACK = 1

 # If you want to use legacy threaded Level 3 implementation.
 # USE_SIMPLE_THREADED_LEVEL3 = 1

diff --git a/Makefile.system b/Makefile.system
index c4cf619d0..bd361a1a2 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1129,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
 ifneq ($(NO_LAPACKE), 1)
 LIB_COMPONENTS += LAPACKE
 endif
+ifeq ($(BUILD_RELAPACK), 1)
+LIB_COMPONENTS += ReLAPACK
+endif
 endif

 ifeq ($(ONLY_CBLAS), 1)
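[Editor's note: with this change ReLAPACK becomes opt-in. If this reading of the Makefiles is right, passing `BUILD_RELAPACK=1` on the make command line, or uncommenting the BUILD_RELAPACK line in Makefile.rule, should re-enable it, and ReLAPACK will then appear in the LIB_COMPONENTS summary printed at the end of the build.]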