Add Elmar Peise's ReLAPACK

This commit is contained in:
Martin Kroeker
2017-06-28 17:38:41 +02:00
committed by GitHub
parent 482015f8d6
commit 9b7b5f7fdc
82 changed files with 20579 additions and 0 deletions

61
relapack/src/blas.h Normal file
View File

@@ -0,0 +1,61 @@
#ifndef BLAS_H
#define BLAS_H
extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *);
extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *);
extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *);
extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *);
extern void BLAS(sscal)(const int *, const float *, float *, const int *);
extern void BLAS(dscal)(const int *, const double *, double *, const int *);
extern void BLAS(cscal)(const int *, const float *, float *, const int *);
extern void BLAS(zscal)(const int *, const double *, double *, const int *);
extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
#if HAVE_XGEMMT
extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
#endif
#endif /* BLAS_H */

230
relapack/src/cgbtrf.c Normal file
View File

@@ -0,0 +1,230 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *,
const int *, float *, const int *, int *, float *, const int *, float *,
const int *, int *);
/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's cgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html
* */
void RELAPACK_cgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CGBTRF", &minfo);
return;
}
// Constant
const float ZERO[] = { 0., 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * kv;
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
float *const A_j = A + 2 * *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[2 * i] = A_j[2 * i + 1] = 0.;
}
// Allocate work space
const int n1 = CREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float));
float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float));
LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** cgbtrf's recursive compute kernel */
static void RELAPACK_cgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_CGBTRF, 1)) {
// Unblocked
LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * kv;
// Splitting
const int n1 = MIN(CREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
float *const Ab_L = Ab;
float *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * m1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
float *const A_Rl = A_R;
float *const A_Rr = A_R + 2 * *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + 2 * *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + 2 * m21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + 2 * *ldA * n21;
float *const A_BRbl = A_BR + 2 * m21;
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
float *const A_Rrj = A_Rr + 2 * *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const float tmpr = A_Rrj[2 * i];
const float tmpc = A_Rrj[2 * i + 1];
A_Rrj[2 * i] = A_Rrj[2 * ip];
A_Rrj[2 * i + 1] = A_Rr[2 * ip + 1];
A_Rrj[2 * ip] = tmpr;
A_Rrj[2 * ip + 1] = tmpc;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}

167
relapack/src/cgemmt.c Normal file
View File

@@ -0,0 +1,167 @@
#include "relapack.h"
static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
/** CGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_cgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int ctransA = LAPACK(lsame)(transA, "C");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
const int ctransB = LAPACK(lsame)(transB, "C");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !ctransA && !notransA)
info = 2;
else if (!tranB && !ctransB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("CGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');
// Recursive kernel
RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** cgemmt's recursive compute kernel */
static void RELAPACK_cgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_CGEMMT, 1)) {
// Unblocked
RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const float *const A_T = A;
const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const float *const B_L = B;
const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
float *const C_TL = C;
float *const C_TR = C + 2 * *ldC * n1;
float *const C_BL = C + 2 * n1;
float *const C_BR = C + 2 * *ldC * n1 + 2 * n1;
// recursion(C_TL)
RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** cgemmt's unblocked compute kernel */
static void RELAPACK_cgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const float *const A_0 = A;
const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
float *const C_0i = C + 2 * *ldC * i;
float *const C_ii = C + 2 * *ldC * i + 2 * i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}

117
relapack/src/cgetrf.c Normal file
View File

@@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
const int *, int *, int *);
/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's cgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
*/
void RELAPACK_cgetrf(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const float ONE[] = { 1., 0. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const float *const A_L = A;
float *const A_R = A + 2 * *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** cgetrf's recursive compute kernel */
static void RELAPACK_cgetrf_rec(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
// Unblocked
LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}

212
relapack/src/chegst.c Normal file
View File

@@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_chegst_rec(const int *, const char *, const int *,
float *, const int *, const float *, const int *,
float *, const int *, int *);
/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's chegst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html
* */
void RELAPACK_chegst(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHEGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
float *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = CREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * 2 * sizeof(float));
if (!Work)
lWork = 0;
#endif
// recursive kernel
RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** chegst's recursive compute kernel */
static void RELAPACK_chegst_rec(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
float *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_CHEGST, 1)) {
// Unblocked
LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const float ZERO[] = { 0., 0. };
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const float HALF[] = { .5, 0. };
const float MHALF[] = { -.5, 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// B_TL B_TR
// B_BL B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BL = B + 2 * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// recursion(A_TL, B_TL)
RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_BL + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_TR = A_BL + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}

236
relapack/src/chetrf.c Normal file
View File

@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's chetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
* */
void RELAPACK_chetrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** chetrf's recursive compute kernel */
static void RELAPACK_chetrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

520
relapack/src/chetrf_rec2.c Normal file
View File

@@ -0,0 +1,520 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method
*
* This routine is a minor modification of LAPACK's clahef.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int *
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2, r__3, r__4;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k;
static float t, r1;
static complex d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen), ccopy_(int *, complex *, int *,
complex *, int *), cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float absakk;
extern /* Subroutine */ int clacgv_(int *, complex *, int *);
extern int icamax_(int *, complex *, int *);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
i__1 = k - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - 1;
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (kw - 1) * w_dim1]), dabs(r__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
r1 = 1.f / a[i__1].r;
i__1 = k - 1;
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + kw * w_dim1], &q__2);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
q__2.r = t, q__2.i = 0.f;
c_div(&q__1, &q__2, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
r_cnjg(&q__2, &d21);
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
q__2.r * q__3.i + q__2.i * q__3.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (imax < *n) {
i__1 = *n - imax;
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (k + 1) * w_dim1]), dabs(r__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
r1 = 1.f / a[i__1].r;
i__1 = *n - k;
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + k * w_dim1], &q__2);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
q__2.r = t, q__2.i = 0.f;
c_div(&q__1, &q__2, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
r_cnjg(&q__2, &d21);
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
q__2.r * q__3.i + q__2.i * q__3.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/chetrf_rook.c Normal file
View File

@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's chetrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html
* */
void RELAPACK_chetrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** chetrf_rook's recursive compute kernel */
static void RELAPACK_chetrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

View File

@@ -0,0 +1,661 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method
*
* This routine is a minor modification of LAPACK's clahef_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, complex *a, int *lda, int *ipiv,
complex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4, q__5;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k, p;
static float t, r1;
static complex d11, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen);
static float sfmin;
extern /* Subroutine */ int ccopy_(int *, complex *, int *,
complex *, int *);
static int itemp;
extern /* Subroutine */ int cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float stemp, absakk;
extern /* Subroutine */ int clacgv_(int *, complex *, int *);
extern int icamax_(int *, complex *, int *);
extern double slamch_(char *, ftnlen);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
if (k > 1) {
i__1 = k - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
c__1);
}
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
if (k > 1) {
i__1 = k - 1;
ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
&c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
if (imax > 1) {
i__1 = imax - 1;
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
}
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = k - 1 - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
i__1 = k - 1 - p;
clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
if (p > 1) {
i__1 = p - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (dabs(t) >= sfmin) {
r1 = 1.f / t;
i__1 = k - 1;
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L14: */
}
}
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + kw * w_dim1], &q__2);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
r_cnjg(&q__5, &d21);
c_div(&q__2, &q__3, &q__5);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
++jj;
if (kstep == 2 && jp1 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
a_dim1], &c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (imax < *n) {
i__1 = *n - imax;
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (k + 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (k + 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = p - k - 1;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
a_dim1], lda);
i__1 = p - k - 1;
clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
if (p < *n) {
i__1 = *n - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
* a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
}
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (dabs(t) >= sfmin) {
r1 = 1.f / t;
i__1 = *n - k;
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L74: */
}
}
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + k * w_dim1], &q__2);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
r_cnjg(&q__5, &d21);
c_div(&q__2, &q__3, &q__5);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
--jj;
if (kstep == 2 && jp1 != jj && j >= 1) {
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

87
relapack/src/clauum.c Normal file
View File

@@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_clauum_rec(const char *, const int *, float *,
const int *, int *);
/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's clauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html
* */
void RELAPACK_clauum(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info);
}
/** clauum's recursive compute kernel */
static void RELAPACK_clauum_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_CLAUUM, 1)) {
// Unblocked
LAPACK(clauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info);
}

157
relapack/src/cpbtrf.c Normal file
View File

@@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *,
float *, const int *, float *, const int *, int *);
/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's cpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html
* */
void RELAPACK_cpbtrf(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const float ZERO[] = { 0., 0. };
// Allocate work space
const int n1 = CREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
float *Work = malloc(mWork * nWork * 2 * sizeof(float));
LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** cpbtrf's recursive compute kernel */
static void RELAPACK_cpbtrf_rec(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
float *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_CPBTRF, 1)) {
// Unblocked
LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(CREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
float *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, *kd);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + 2 * *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + 2 * n21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + 2 * *ldA * n21;
float *const A_BRbl = A_BR + 2 * n21;
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
if (*uplo == 'L') {
// A_BLt = ABLt / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/cpotrf.c Normal file
View File

@@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_cpotrf_rec(const char *, const int *, float *,
const int *, int *);
/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's cpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html
* */
void RELAPACK_cpotrf(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** cpotrf's recursive compute kernel */
static void RELAPACK_cpotrf_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_CPOTRF, 1)) {
// Unblocked
LAPACK(cpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

238
relapack/src/csytrf.c Normal file
View File

@@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's csytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html
* */
void RELAPACK_csytrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** csytrf's recursive compute kernel */
static void RELAPACK_csytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

451
relapack/src/csytrf_rec2.c Normal file
View File

@@ -0,0 +1,451 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method.
*
* This routine is a minor modification of LAPACK's clasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2, r__3, r__4;
complex q__1, q__2, q__3;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k;
static complex t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern /* Subroutine */ int cscal_(int *, complex *, complex *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen), ccopy_(int *, complex *, int *,
complex *, int *), cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float absakk;
extern int icamax_(int *, complex *, int *);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
w_dim1]), dabs(r__2));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (kw - 1) * w_dim1]), dabs(r__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha *
rowmax) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = k - 1;
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + kw * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
c_div(&q__1, &t, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
w_dim1]), dabs(r__2));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (k + 1) * w_dim1]), dabs(r__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha *
rowmax) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = *n - k;
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k + k * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
c_div(&q__1, &t, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/csytrf_rook.c Normal file
View File

@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's csytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html
* */
void RELAPACK_csytrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** csytrf_rook's recursive compute kernel */
static void RELAPACK_csytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

View File

@@ -0,0 +1,565 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's clasyf_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, complex *a, int *lda, int *ipiv,
complex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k, p;
static complex t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern /* Subroutine */ int cscal_(int *, complex *, complex *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen);
static float sfmin;
extern /* Subroutine */ int ccopy_(int *, complex *, int *,
complex *, int *);
static int itemp;
extern /* Subroutine */ int cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float stemp, absakk;
extern int icamax_(int *, complex *, int *);
extern double slamch_(char *, ftnlen);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
w_dim1]), dabs(r__2));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha *
rowmax)) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = k - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ k * a_dim1]), dabs(r__2)) >= sfmin) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = k - 1;
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L14: */
}
}
}
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d12.r = w[i__1].r, d12.i = w[i__1].i;
c_div(&q__1, &w[k + kw * w_dim1], &d12);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d12);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d12);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
w_dim1]), dabs(r__2));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (k + 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (k + 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha *
rowmax)) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - k - 1;
ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ k * a_dim1]), dabs(r__2)) >= sfmin) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = *n - k;
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L74: */
}
}
}
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k + k * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

268
relapack/src/ctgsyl.c Normal file
View File

@@ -0,0 +1,268 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *,
const int *, const float *, const int *, const float *, const int *,
float *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, float *, float *, int *);
/** CTGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's ctgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
* */
void RELAPACK_ctgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "C");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'C';
// Constant
const float ZERO[] = { 0., 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
float scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
float dscale = 0;
float dsum = 1;
RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** ctgsyl's recursive vompute kernel */
static void RELAPACK_ctgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dsum, float *dscale,
int *info
) {
if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
// Unblocked
LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1., 0. };
float scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = CREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + 2 * *ldA * m1;
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + 2 * m1;
// D_TL D_TR
// 0 D_BR
const float *const D_TL = D;
const float *const D_TR = D + 2 * *ldD * m1;
const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1;
// F_T
// F_B
float *const F_T = F;
float *const F_B = F + 2 * m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
// C_T = C_T - A_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + 2 * *ldC * n1;
// E_TL E_TR
// 0 E_BR
const float *const E_TL = E;
const float *const E_TR = E + 2 * *ldE * n1;
const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1;
// F_L F_R
float *const F_L = F;
float *const F_R = F + 2 * *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
// C_R = C_R + F_L * B_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

163
relapack/src/ctrsyl.c Normal file
View File

@@ -0,0 +1,163 @@
#include "relapack.h"
static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *,
const int *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, int *);
/** CTRSYL solves the complex Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's ctrsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html
* */
void RELAPACK_ctrsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!ctransA && !notransA)
*info = -1;
else if (!ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : 'C';
const char cleantranB = notransB ? 'N' : 'C';
// Recursive kernel
RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** ctrsyl's recursive compute kernel */
static void RELAPACK_ctrsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) {
// Unblocked
RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const float MSGN[] = { -*isgn, 0. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1., 0. };
float scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = CREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + 2 * *ldA * m1;
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + 2 * m1;
if (*tranA == 'N') {
// recusion(A_BR, B, C_B)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recusion(A_TL, B, C_T)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recusion(A_TL, B, C_T)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recusion(A_BR, B, C_B)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
}
} else {
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + 2 * *ldC * n1;
if (*tranB == 'N') {
// recusion(A, B_TL, C_L)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recusion(A, B_BR, C_R)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recusion(A, B_BR, C_R)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recusion(A, B_TL, C_L)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

392
relapack/src/ctrsyl_rec2.c Normal file
View File

@@ -0,0 +1,392 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "../config.h"
#include "f2c.h"
#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
extern void cdotu_(complex *, int *, complex *, int *, complex *, int *);
complex result;
cdotu_(&result, n, x, incx, y, incy);
return result;
}
#define cdotu_ cdotu_fun
complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
extern void cdotc_(complex *, int *, complex *, int *, complex *, int *);
complex result;
cdotc_(&result, n, x, incx, y, incy);
return result;
}
#define cdotc_ cdotc_fun
#endif
#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
complex cladiv_fun(complex *a, complex *b) {
extern void cladiv_(complex *, complex *, complex *);
complex result;
cladiv_(&result, a, b);
return result;
}
#define cladiv_ cladiv_fun
#endif
/* Table of constant values */
static int c__1 = 1;
/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
*
* This routine is an exact copy of LAPACK's ctrsyl.
* It serves as an unblocked kernel in the recursive algorithms.
* */
/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int
*isgn, int *m, int *n, complex *a, int *lda, complex *b,
int *ldb, complex *c__, int *ldc, float *scale, int *info,
ftnlen trana_len, ftnlen tranb_len)
{
/* System generated locals */
int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
float r_imag(complex *);
void r_cnjg(complex *, complex *);
/* Local variables */
static int j, k, l;
static complex a11;
static float db;
static complex x11;
static float da11;
static complex vec;
static float dum[1], eps, sgn, smin;
static complex suml, sumr;
/* Complex */ complex cdotc_(int *, complex *, int
*, complex *, int *);
extern int lsame_(char *, char *, ftnlen, ftnlen);
/* Complex */ complex cdotu_(int *, complex *, int
*, complex *, int *);
extern /* Subroutine */ int slabad_(float *, float *);
extern float clange_(char *, int *, int *, complex *,
int *, float *, ftnlen);
/* Complex */ complex cladiv_(complex *, complex *);
static float scaloc;
extern float slamch_(char *, ftnlen);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*), xerbla_(char *, int *, ftnlen);
static float bignum;
static int notrna, notrnb;
static float smlnum;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
b_dim1 = *ldb;
b_offset = 1 + b_dim1;
b -= b_offset;
c_dim1 = *ldc;
c_offset = 1 + c_dim1;
c__ -= c_offset;
/* Function Body */
notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
*info = 0;
if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
*info = -1;
} else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
*info = -2;
} else if (*isgn != 1 && *isgn != -1) {
*info = -3;
} else if (*m < 0) {
*info = -4;
} else if (*n < 0) {
*info = -5;
} else if (*lda < max(1,*m)) {
*info = -7;
} else if (*ldb < max(1,*n)) {
*info = -9;
} else if (*ldc < max(1,*m)) {
*info = -11;
}
if (*info != 0) {
i__1 = -(*info);
xerbla_("CTRSY2", &i__1, (ftnlen)6);
return;
}
*scale = 1.f;
if (*m == 0 || *n == 0) {
return;
}
eps = slamch_("P", (ftnlen)1);
smlnum = slamch_("S", (ftnlen)1);
bignum = 1.f / smlnum;
slabad_(&smlnum, &bignum);
smlnum = smlnum * (float) (*m * *n) / eps;
bignum = 1.f / smlnum;
/* Computing MAX */
r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, (
ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n,
&b[b_offset], ldb, dum, (ftnlen)1);
smin = dmax(r__1,r__2);
sgn = (float) (*isgn);
if (notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
for (k = *m; k >= 1; --k) {
i__2 = *m - k;
/* Computing MIN */
i__3 = k + 1;
/* Computing MIN */
i__4 = k + 1;
q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
min(i__4,*m) + l * c_dim1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__2 = l - 1;
q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = q__1.r, sumr.i = q__1.i;
i__2 = k + l * c_dim1;
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i;
q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L10: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L20: */
}
/* L30: */
}
} else if (! notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
i__2 = *m;
for (k = 1; k <= i__2; ++k) {
i__3 = k - 1;
q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__3 = l - 1;
q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = q__1.r, sumr.i = q__1.i;
i__3 = k + l * c_dim1;
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
r_cnjg(&q__2, &a[k + k * a_dim1]);
i__3 = l + l * b_dim1;
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L40: */
}
*scale *= scaloc;
}
i__3 = k + l * c_dim1;
c__[i__3].r = x11.r, c__[i__3].i = x11.i;
/* L50: */
}
/* L60: */
}
} else if (! notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
i__1 = *m;
for (k = 1; k <= i__1; ++k) {
i__2 = k - 1;
q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__2 = *n - l;
/* Computing MIN */
i__3 = l + 1;
/* Computing MIN */
i__4 = l + 1;
q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
l + min(i__4,*n) * b_dim1], ldb);
sumr.r = q__1.r, sumr.i = q__1.i;
i__2 = k + l * c_dim1;
r_cnjg(&q__4, &sumr);
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i;
r_cnjg(&q__1, &q__2);
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L70: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L80: */
}
/* L90: */
}
} else if (notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
for (k = *m; k >= 1; --k) {
i__1 = *m - k;
/* Computing MIN */
i__2 = k + 1;
/* Computing MIN */
i__3 = k + 1;
q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
min(i__3,*m) + l * c_dim1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__1 = *n - l;
/* Computing MIN */
i__2 = l + 1;
/* Computing MIN */
i__3 = l + 1;
q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
l + min(i__3,*n) * b_dim1], ldb);
sumr.r = q__1.r, sumr.i = q__1.i;
i__1 = k + l * c_dim1;
r_cnjg(&q__4, &sumr);
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__1 = k + k * a_dim1;
r_cnjg(&q__3, &b[l + l * b_dim1]);
q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i;
q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L100: */
}
*scale *= scaloc;
}
i__1 = k + l * c_dim1;
c__[i__1].r = x11.r, c__[i__1].i = x11.i;
/* L110: */
}
/* L120: */
}
}
return;
}

107
relapack/src/ctrtri.c Normal file
View File

@@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_ctrtri_rec(const char *, const char *, const int *,
float *, const int *, int *);
/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's ctrtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html
* */
void RELAPACK_ctrtri(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
*info = i;
return;
}
}
// Recursive kernel
RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** ctrtri's recursive compute kernel */
static void RELAPACK_ctrtri_rec(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_CTRTRI, 1)) {
// Unblocked
LAPACK(ctrti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

227
relapack/src/dgbtrf.c Normal file
View File

@@ -0,0 +1,227 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *,
const int *, double *, const int *, int *, double *, const int *, double *,
const int *, int *);
/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's dgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html
* */
void RELAPACK_dgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DGBTRF", &minfo);
return;
}
// Constant
const double ZERO[] = { 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + kv;
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
double *const A_j = A + *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[i] = 0.;
}
// Allocate work space
const int n1 = DREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
double *Workl = malloc(mWorkl * nWorkl * sizeof(double));
double *Worku = malloc(mWorku * nWorku * sizeof(double));
LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** dgbtrf's recursive compute kernel */
static void RELAPACK_dgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_DGBTRF, 1)) {
// Unblocked
LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + kv;
// Splitting
const int n1 = MIN(DREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
double *const Ab_L = Ab;
double *const Ab_BR = Ab + *ldAb * n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + m1;
double *const A_BR = A + *ldA * n1 + m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
double *const A_Rl = A_R;
double *const A_Rr = A_R + *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + m21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + *ldA * n21;
double *const A_BRbl = A_BR + m21;
double *const A_BRbr = A_BR + *ldA * n21 + m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
double *const A_Rrj = A_Rr + *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const double tmp = A_Rrj[i];
A_Rrj[i] = A_Rr[ip];
A_Rrj[ip] = tmp;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}

165
relapack/src/dgemmt.c Normal file
View File

@@ -0,0 +1,165 @@
#include "relapack.h"
static void RELAPACK_dgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
/** DGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_dgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !notransA)
info = 2;
else if (!tranB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("DGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : 'T';
const char cleantransB = notransB ? 'N' : 'T';
// Recursive kernel
RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** dgemmt's recursive compute kernel */
static void RELAPACK_dgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_DGEMMT, 1)) {
// Unblocked
RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const double *const A_T = A;
const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const double *const B_L = B;
const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
double *const C_TL = C;
double *const C_TR = C + *ldC * n1;
double *const C_BL = C + n1;
double *const C_BR = C + *ldC * n1 + n1;
// recursion(C_TL)
RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** dgemmt's unblocked compute kernel */
static void RELAPACK_dgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const double *const A_0 = A;
const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
double *const C_0i = C + *ldC * i;
double *const C_ii = C + *ldC * i + i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}

117
relapack/src/dgetrf.c Normal file
View File

@@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
const int *, int *, int *);
/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's dgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
* */
void RELAPACK_dgetrf(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const double ONE[] = { 1. };
const int iONE[] = { 1. };
// Splitting
const int rn = *n - *m;
// A_L A_R
const double *const A_L = A;
double *const A_R = A + *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_S \ A_R
BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** dgetrf's recursive compute kernel */
static void RELAPACK_dgetrf_rec(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
// Unblocked
LAPACK(dgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}

87
relapack/src/dlauum.c Normal file
View File

@@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_dlauum_rec(const char *, const int *, double *,
const int *, int *);
/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's dlauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html
* */
void RELAPACK_dlauum(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info);
}
/** dlauum's recursive compute kernel */
static void RELAPACK_dlauum_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_DLAUUM, 1)) {
// Unblocked
LAPACK(dlauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info);
}

157
relapack/src/dpbtrf.c Normal file
View File

@@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *,
double *, const int *, double *, const int *, int *);
/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's dpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html
* */
void RELAPACK_dpbtrf(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const double ZERO[] = { 0. };
// Allocate work space
const int n1 = DREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
double *Work = malloc(mWork * nWork * sizeof(double));
LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** dpbtrf's recursive compute kernel */
static void RELAPACK_dpbtrf_rec(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
double *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
// Unblocked
LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(DREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
double *const Ab_BR = Ab + *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, n1);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + n21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + *ldA * n21;
double *const A_BRbl = A_BR + n21;
double *const A_BRbr = A_BR + *ldA * n21 + n21;
if (*uplo == 'L') {
// A_BLt = ABLt / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/dpotrf.c Normal file
View File

@@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_dpotrf_rec(const char *, const int *, double *,
const int *, int *);
/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's dpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
* */
void RELAPACK_dpotrf(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** dpotrf's recursive compute kernel */
static void RELAPACK_dpotrf_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_DPOTRF, 1)) {
// Unblocked
LAPACK(dpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

212
relapack/src/dsygst.c Normal file
View File

@@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_dsygst_rec(const int *, const char *, const int *,
double *, const int *, const double *, const int *,
double *, const int *, int *);
/** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's dsygst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html
* */
void RELAPACK_dsygst(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DSYGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
double *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = DREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * sizeof(double));
if (!Work)
lWork = 0;
#endif
// recursive kernel
RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** dsygst's recursive compute kernel */
static void RELAPACK_dsygst_rec(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
double *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
// Unblocked
LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const double ZERO[] = { 0. };
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const double HALF[] = { .5 };
const double MHALF[] = { -.5 };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// B_TL B_TR
// B_BL B_BR
const double *const B_TL = B;
const double *const B_TR = B + *ldB * n1;
const double *const B_BL = B + n1;
const double *const B_BR = B + *ldB * n1 + n1;
// recursion(A_TL, B_TL)
RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_BL + T
for (i = 0; i < n2; i++)
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_TR = A_BL + T
for (i = 0; i < n2; i++)
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}

238
relapack/src/dsytrf.c Normal file
View File

@@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** DSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's dsytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html
* */
void RELAPACK_dsytrf(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** dsytrf's recursive compute kernel */
static void RELAPACK_dsytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_DSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = DREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
double *const A_BL_B = A + *n;
double *const A_BR_B = A + *ldA * n1 + *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + n1;
double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = DREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + *ldA * n_rest;
double *const A_TR_T = A + *ldA * (n_rest + n1);
double *const A_TL = A + *ldA * n_rest + n_rest;
double *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

352
relapack/src/dsytrf_rec2.c Normal file
View File

@@ -0,0 +1,352 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static int c__1 = 1;
static double c_b8 = -1.;
static double c_b9 = 1.;
/** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method.
*
* This routine is a minor modification of LAPACK's dlasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int *
nb, int *kb, double *a, int *lda, int *ipiv,
double *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
double d__1, d__2, d__3;
/* Builtin functions */
double sqrt(double);
/* Local variables */
static int j, k;
static double t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static double alpha;
extern /* Subroutine */ int dscal_(int *, double *, double *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int dgemv_(char *, int *, int *,
double *, double *, int *, double *, int *,
double *, double *, int *, ftnlen), dcopy_(int *,
double *, int *, double *, int *), dswap_(int
*, double *, int *, double *, int *);
static int kstep;
static double absakk;
extern int idamax_(int *, double *, int *);
static double colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
absakk = (d__1 = w[k + kw * w_dim1], abs(d__1));
if (k > 1) {
i__1 = k - 1;
imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1));
if (imax > 1) {
i__1 = imax - 1;
jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1],
abs(d__1));
rowmax = max(d__2,d__3);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >=
alpha * rowmax) {
kp = imax;
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
i__1 = kk - 1 - kp;
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
r1 = 1. / a[k + k * a_dim1];
i__1 = k - 1;
dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
d21 = w[k - 1 + kw * w_dim1];
d11 = w[k + kw * w_dim1] / d21;
d22 = w[k - 1 + (kw - 1) * w_dim1] / d21;
t = 1. / (d11 * d22 - 1.);
d21 = t / d21;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1)
* w_dim1] - w[j + kw * w_dim1]);
a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] -
w[j + (kw - 1) * w_dim1]);
/* L20: */
}
}
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
a[k + k * a_dim1] = w[k + kw * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
absakk = (d__1 = w[k + k * w_dim1], abs(d__1));
if (k < *n) {
i__1 = *n - k;
imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
colmax = (d__1 = w[imax + k * w_dim1], abs(d__1));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1],
abs(d__1));
rowmax = max(d__2,d__3);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >=
alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
kk = k + kstep - 1;
if (kp != kk) {
a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
i__1 = kp - kk - 1;
dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
r1 = 1. / a[k + k * a_dim1];
i__1 = *n - k;
dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
d21 = w[k + 1 + k * w_dim1];
d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
d22 = w[k + k * w_dim1] / d21;
t = 1. / (d11 * d22 - 1.);
d21 = t / d21;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] -
w[j + (k + 1) * w_dim1]);
a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) *
w_dim1] - w[j + k * w_dim1]);
/* L80: */
}
}
a[k + k * a_dim1] = w[k + k * w_dim1];
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/dsytrf_rook.c Normal file
View File

@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's dsytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html
* */
void RELAPACK_dsytrf_rook(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** dsytrf_rook's recursive compute kernel */
static void RELAPACK_dsytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = DREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
double *const A_BL_B = A + *n;
double *const A_BR_B = A + *ldA * n1 + *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + n1;
double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = DREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + *ldA * n_rest;
double *const A_TR_T = A + *ldA * (n_rest + n1);
double *const A_TL = A + *ldA * n_rest + n_rest;
double *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

View File

@@ -0,0 +1,451 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static int c__1 = 1;
static double c_b9 = -1.;
static double c_b10 = 1.;
/** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's dlasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, double *a, int *lda, int *ipiv,
double *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
double d__1;
/* Builtin functions */
double sqrt(double);
/* Local variables */
static int j, k, p;
static double t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static double alpha;
extern /* Subroutine */ int dscal_(int *, double *, double *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int dgemv_(char *, int *, int *,
double *, double *, int *, double *, int *,
double *, double *, int *, ftnlen);
static double dtemp, sfmin;
static int itemp;
extern /* Subroutine */ int dcopy_(int *, double *, int *,
double *, int *), dswap_(int *, double *, int
*, double *, int *);
static int kstep;
extern double dlamch_(char *, ftnlen);
static double absakk;
extern int idamax_(int *, double *, int *);
static double colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
sfmin = dlamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
absakk = (d__1 = w[k + kw * w_dim1], abs(d__1));
if (k > 1) {
i__1 = k - 1;
imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1));
} else {
rowmax = 0.;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) <
alpha * rowmax)) {
kp = imax;
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
a[kp + k * a_dim1] = a[kk + k * a_dim1];
i__1 = k - 1 - kp;
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
r1 = 1. / a[k + k * a_dim1];
i__1 = k - 1;
dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else if (a[k + k * a_dim1] != 0.) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
a[ii + k * a_dim1] /= a[k + k * a_dim1];
/* L14: */
}
}
}
} else {
if (k > 2) {
d12 = w[k - 1 + kw * w_dim1];
d11 = w[k + kw * w_dim1] / d12;
d22 = w[k - 1 + (kw - 1) * w_dim1] / d12;
t = 1. / (d11 * d22 - 1.);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) *
w_dim1] - w[j + kw * w_dim1]) / d12);
a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] -
w[j + (kw - 1) * w_dim1]) / d12);
/* L20: */
}
}
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
a[k + k * a_dim1] = w[k + kw * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
absakk = (d__1 = w[k + k * w_dim1], abs(d__1));
if (k < *n) {
i__1 = *n - k;
imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
colmax = (d__1 = w[imax + k * w_dim1], abs(d__1));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k
+ 1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1));
} else {
rowmax = 0.;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha
* rowmax)) {
kp = imax;
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
a[kp + k * a_dim1] = a[kk + k * a_dim1];
i__1 = kp - k - 1;
dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
r1 = 1. / a[k + k * a_dim1];
i__1 = *n - k;
dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else if (a[k + k * a_dim1] != 0.) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
a[ii + k * a_dim1] /= a[k + k * a_dim1];
/* L74: */
}
}
}
} else {
if (k < *n - 1) {
d21 = w[k + 1 + k * w_dim1];
d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
d22 = w[k + k * w_dim1] / d21;
t = 1. / (d11 * d22 - 1.);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
j + (k + 1) * w_dim1]) / d21);
a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
w_dim1] - w[j + k * w_dim1]) / d21);
/* L80: */
}
}
a[k + k * a_dim1] = w[k + k * w_dim1];
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

274
relapack/src/dtgsyl.c Normal file
View File

@@ -0,0 +1,274 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
const int *, const double *, const int *, const double *, const int *,
double *, const int *, const double *, const int *, const double *,
const int *, double *, const int *, double *, double *, double *, int *,
int *, int *);
/** DTGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's dtgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html
* */
void RELAPACK_dtgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "T");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DTGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'T';
// Constant
const double ZERO[] = { 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
double scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
double dscale = 0;
double dsum = 1;
int pq;
RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(pq) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(dlacpy)("F", m, n, C, ldC, Work, m);
LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m);
LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(dlacpy)("F", m, n, Work, m, C, ldC);
LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** dtgsyl's recursive vompute kernel */
static void RELAPACK_dtgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dsum, double *dscale,
int *iWork, int *pq, int *info
) {
if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) {
// Unblocked
LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Outputs
double scale1[] = { 1. };
double scale2[] = { 1. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
int m1 = DREC_SPLIT(*m);
if (A[m1 + *ldA * (m1 - 1)])
m1++;
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const double *const A_TL = A;
const double *const A_TR = A + *ldA * m1;
const double *const A_BR = A + *ldA * m1 + m1;
// C_T
// C_B
double *const C_T = C;
double *const C_B = C + m1;
// D_TL D_TR
// 0 D_BR
const double *const D_TL = D;
const double *const D_TR = D + *ldD * m1;
const double *const D_BR = D + *ldD * m1 + m1;
// F_T
// F_B
double *const F_T = F;
double *const F_B = F + m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
// C_T = C_T - A_TR * C_B
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
int n1 = DREC_SPLIT(*n);
if (B[n1 + *ldB * (n1 - 1)])
n1++;
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const double *const B_TL = B;
const double *const B_TR = B + *ldB * n1;
const double *const B_BR = B + *ldB * n1 + n1;
// C_L C_R
double *const C_L = C;
double *const C_R = C + *ldC * n1;
// E_TL E_TR
// 0 E_BR
const double *const E_TL = E;
const double *const E_TR = E + *ldE * n1;
const double *const E_BR = E + *ldE * n1 + n1;
// F_L F_R
double *const F_L = F;
double *const F_R = F + *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
// C_R = C_R + F_L * B_TR
BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

169
relapack/src/dtrsyl.c Normal file
View File

@@ -0,0 +1,169 @@
#include "relapack.h"
static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *,
const int *, const int *, const double *, const int *, const double *,
const int *, double *, const int *, double *, int *);
/** DTRSYL solves the real Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's dtrsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html
* */
void RELAPACK_dtrsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int transA = LAPACK(lsame)(tranA, "T");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int transB = LAPACK(lsame)(tranB, "T");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!transA && !ctransA && !notransA)
*info = -1;
else if (!transB && !ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DTRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
// Recursive kernel
RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** dtrsyl's recursive compute kernel */
static void RELAPACK_dtrsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) {
// Unblocked
RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const double MSGN[] = { -*isgn };
const int iONE[] = { 1 };
// Outputs
double scale1[] = { 1. };
double scale2[] = { 1. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
int m1 = DREC_SPLIT(*m);
if (A[m1 + *ldA * (m1 - 1)])
m1++;
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const double *const A_TL = A;
const double *const A_TR = A + *ldA * m1;
const double *const A_BR = A + *ldA * m1 + m1;
// C_T
// C_B
double *const C_T = C;
double *const C_B = C + m1;
if (*tranA == 'N') {
// recusion(A_BR, B, C_B)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recusion(A_TL, B, C_T)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recusion(A_TL, B, C_T)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recusion(A_BR, B, C_B)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
}
} else {
// Splitting
int n1 = DREC_SPLIT(*n);
if (B[n1 + *ldB * (n1 - 1)])
n1++;
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const double *const B_TL = B;
const double *const B_TR = B + *ldB * n1;
const double *const B_BR = B + *ldB * n1 + n1;
// C_L C_R
double *const C_L = C;
double *const C_R = C + *ldC * n1;
if (*tranB == 'N') {
// recusion(A, B_TL, C_L)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recusion(A, B_BR, C_R)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recusion(A, B_BR, C_R)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recusion(A, B_TL, C_L)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

1034
relapack/src/dtrsyl_rec2.c Normal file

File diff suppressed because it is too large Load Diff

107
relapack/src/dtrtri.c Normal file
View File

@@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_dtrtri_rec(const char *, const char *, const int *,
double *, const int *, int *);
/** DTRTRI computes the inverse of a real upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's dtrtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html
* */
void RELAPACK_dtrtri(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DTRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[i + *ldA * i] == 0) {
*info = i;
return;
}
}
// Recursive kernel
RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** dtrtri's recursive compute kernel */
static void RELAPACK_dtrtri_rec(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_DTRTRI, 1)) {
// Unblocked
LAPACK(dtrti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

109
relapack/src/f2c.c Normal file
View File

@@ -0,0 +1,109 @@
#include "stdlib.h"
#include "stdio.h"
#include "signal.h"
#include "f2c.h"
#ifndef SIGIOT
#ifdef SIGABRT
#define SIGIOT SIGABRT
#endif
#endif
void sig_die(const char *s, int kill) {
/* print error message, then clear buffers */
fprintf(stderr, "%s\n", s);
if(kill) {
fflush(stderr);
/* now get a core */
signal(SIGIOT, SIG_DFL);
abort();
} else
exit(1);
}
void c_div(complex *c, complex *a, complex *b) {
double ratio, den;
double abr, abi, cr;
if( (abr = b->r) < 0.)
abr = - abr;
if( (abi = b->i) < 0.)
abi = - abi;
if( abr <= abi ) {
if(abi == 0) {
#ifdef IEEE_COMPLEX_DIVIDE
float af, bf;
af = bf = abr;
if (a->i != 0 || a->r != 0)
af = 1.;
c->i = c->r = af / bf;
return;
#else
sig_die("complex division by zero", 1);
#endif
}
ratio = (double)b->r / b->i ;
den = b->i * (1 + ratio*ratio);
cr = (a->r*ratio + a->i) / den;
c->i = (a->i*ratio - a->r) / den;
} else {
ratio = (double)b->i / b->r ;
den = b->r * (1 + ratio*ratio);
cr = (a->r + a->i*ratio) / den;
c->i = (a->i - a->r*ratio) / den;
}
c->r = cr;
}
void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) {
double ratio, den;
double abr, abi, cr;
if( (abr = b->r) < 0.)
abr = - abr;
if( (abi = b->i) < 0.)
abi = - abi;
if( abr <= abi ) {
if(abi == 0) {
#ifdef IEEE_COMPLEX_DIVIDE
if (a->i != 0 || a->r != 0)
abi = 1.;
c->i = c->r = abi / abr;
return;
#else
sig_die("complex division by zero", 1);
#endif
}
ratio = b->r / b->i ;
den = b->i * (1 + ratio*ratio);
cr = (a->r*ratio + a->i) / den;
c->i = (a->i*ratio - a->r) / den;
} else {
ratio = b->i / b->r ;
den = b->r * (1 + ratio*ratio);
cr = (a->r + a->i*ratio) / den;
c->i = (a->i - a->r*ratio) / den;
}
c->r = cr;
}
float r_imag(complex *z) {
return z->i;
}
void r_cnjg(complex *r, complex *z) {
float zi = z->i;
r->r = z->r;
r->i = -zi;
}
double d_imag(doublecomplex *z) {
return z->i;
}
void d_cnjg(doublecomplex *r, doublecomplex *z) {
double zi = z->i;
r->r = z->r;
r->i = -zi;
}

223
relapack/src/f2c.h Normal file
View File

@@ -0,0 +1,223 @@
/* f2c.h -- Standard Fortran to C header file */
/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed."
- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
#ifndef F2C_INCLUDE
#define F2C_INCLUDE
typedef long int integer;
typedef unsigned long int uinteger;
typedef char *address;
typedef short int shortint;
typedef float real;
typedef double doublereal;
typedef struct { real r, i; } complex;
typedef struct { doublereal r, i; } doublecomplex;
typedef long int logical;
typedef short int shortlogical;
typedef char logical1;
typedef char integer1;
#ifdef INTEGER_STAR_8 /* Adjust for integer*8. */
typedef long long longint; /* system-dependent */
typedef unsigned long long ulongint; /* system-dependent */
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#endif
#define TRUE_ (1)
#define FALSE_ (0)
/* Extern is for use with -E */
#ifndef Extern
#define Extern extern
#endif
/* I/O stuff */
#ifdef f2c_i2
/* for -i2 */
typedef short flag;
typedef short ftnlen;
typedef short ftnint;
#else
typedef long int flag;
typedef long int ftnlen;
typedef long int ftnint;
#endif
/*external read, write*/
typedef struct
{ flag cierr;
ftnint ciunit;
flag ciend;
char *cifmt;
ftnint cirec;
} cilist;
/*internal read, write*/
typedef struct
{ flag icierr;
char *iciunit;
flag iciend;
char *icifmt;
ftnint icirlen;
ftnint icirnum;
} icilist;
/*open*/
typedef struct
{ flag oerr;
ftnint ounit;
char *ofnm;
ftnlen ofnmlen;
char *osta;
char *oacc;
char *ofm;
ftnint orl;
char *oblnk;
} olist;
/*close*/
typedef struct
{ flag cerr;
ftnint cunit;
char *csta;
} cllist;
/*rewind, backspace, endfile*/
typedef struct
{ flag aerr;
ftnint aunit;
} alist;
/* inquire */
typedef struct
{ flag inerr;
ftnint inunit;
char *infile;
ftnlen infilen;
ftnint *inex; /*parameters in standard's order*/
ftnint *inopen;
ftnint *innum;
ftnint *innamed;
char *inname;
ftnlen innamlen;
char *inacc;
ftnlen inacclen;
char *inseq;
ftnlen inseqlen;
char *indir;
ftnlen indirlen;
char *infmt;
ftnlen infmtlen;
char *inform;
ftnint informlen;
char *inunf;
ftnlen inunflen;
ftnint *inrecl;
ftnint *innrec;
char *inblank;
ftnlen inblanklen;
} inlist;
#define VOID void
union Multitype { /* for multiple entry points */
integer1 g;
shortint h;
integer i;
/* longint j; */
real r;
doublereal d;
complex c;
doublecomplex z;
};
typedef union Multitype Multitype;
/*typedef long int Long;*/ /* No longer used; formerly in Namelist */
struct Vardesc { /* for Namelist */
char *name;
char *addr;
ftnlen *dims;
int type;
};
typedef struct Vardesc Vardesc;
struct Namelist {
char *name;
Vardesc **vars;
int nvars;
};
typedef struct Namelist Namelist;
#define abs(x) ((x) >= 0 ? (x) : -(x))
#define dabs(x) (doublereal)abs(x)
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))
#define dmin(a,b) (doublereal)min(a,b)
#define dmax(a,b) (doublereal)max(a,b)
#define bit_test(a,b) ((a) >> (b) & 1)
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
/* procedure parameter types for -A and -C++ */
#define F2C_proc_par_types 1
#ifdef __cplusplus
typedef int /* Unknown procedure type */ (*U_fp)(...);
typedef shortint (*J_fp)(...);
typedef integer (*I_fp)(...);
typedef real (*R_fp)(...);
typedef doublereal (*D_fp)(...), (*E_fp)(...);
typedef /* Complex */ VOID (*C_fp)(...);
typedef /* Double Complex */ VOID (*Z_fp)(...);
typedef logical (*L_fp)(...);
typedef shortlogical (*K_fp)(...);
typedef /* Character */ VOID (*H_fp)(...);
typedef /* Subroutine */ int (*S_fp)(...);
#else
typedef int /* Unknown procedure type */ (*U_fp)();
typedef shortint (*J_fp)();
typedef integer (*I_fp)();
typedef real (*R_fp)();
typedef doublereal (*D_fp)(), (*E_fp)();
typedef /* Complex */ VOID (*C_fp)();
typedef /* Double Complex */ VOID (*Z_fp)();
typedef logical (*L_fp)();
typedef shortlogical (*K_fp)();
typedef /* Character */ VOID (*H_fp)();
typedef /* Subroutine */ int (*S_fp)();
#endif
/* E_fp is for real functions when -R is not specified */
typedef VOID C_f; /* complex function */
typedef VOID H_f; /* character function */
typedef VOID Z_f; /* double complex function */
typedef doublereal E_f; /* real function with -R not specified */
/* undef any lower-case symbols that your C compiler predefines, e.g.: */
#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif
#endif

80
relapack/src/lapack.h Normal file
View File

@@ -0,0 +1,80 @@
#ifndef LAPACK_H
#define LAPACK_H
extern int LAPACK(lsame)(const char *, const char *);
extern int LAPACK(xerbla)(const char *, const int *);
extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *);
extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *);
extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *);
extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *);
extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *);
extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *);
extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *);
extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *);
extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *);
extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *);
extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *);
extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *);
extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *);
extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *);
extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *);
extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *);
extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *);
extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *);
extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *);
extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *);
extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *);
extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *);
#endif /* LAPACK_H */

View File

@@ -0,0 +1,607 @@
#include "relapack.h"
////////////
// XLAUUM //
////////////
#if INCLUDE_SLAUUM
void LAPACK(slauum)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_slauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DLAUUM
void LAPACK(dlauum)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dlauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CLAUUM
void LAPACK(clauum)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_clauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZLAUUM
void LAPACK(zlauum)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_zlauum(uplo, n, A, ldA, info);
}
#endif
////////////
// XSYGST //
////////////
#if INCLUDE_SSYGST
void LAPACK(ssygst)(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_DSYGST
void LAPACK(dsygst)(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_CHEGST
void LAPACK(chegst)(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_ZHEGST
void LAPACK(zhegst)(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
////////////
// XTRTRI //
////////////
#if INCLUDE_STRTRI
void LAPACK(strtri)(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_strtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_DTRTRI
void LAPACK(dtrtri)(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_CTRTRI
void LAPACK(ctrtri)(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_ZTRTRI
void LAPACK(ztrtri)(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
}
#endif
////////////
// XPOTRF //
////////////
#if INCLUDE_SPOTRF
void LAPACK(spotrf)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_spotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DPOTRF
void LAPACK(dpotrf)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dpotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CPOTRF
void LAPACK(cpotrf)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_cpotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZPOTRF
void LAPACK(zpotrf)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_zpotrf(uplo, n, A, ldA, info);
}
#endif
////////////
// XPBTRF //
////////////
#if INCLUDE_SPBTRF
void LAPACK(spbtrf)(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_DPBTRF
void LAPACK(dpbtrf)(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_CPBTRF
void LAPACK(cpbtrf)(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_ZPBTRF
void LAPACK(zpbtrf)(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
////////////
// XSYTRF //
////////////
#if INCLUDE_SSYTRF
void LAPACK(ssytrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_DSYTRF
void LAPACK(dsytrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CSYTRF
void LAPACK(csytrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZSYTRF
void LAPACK(zsytrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CHETRF
void LAPACK(chetrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZHETRF
void LAPACK(zhetrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_SSYTRF_ROOK
void LAPACK(ssytrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_DSYTRF_ROOK
void LAPACK(dsytrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CSYTRF_ROOK
void LAPACK(csytrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZSYTRF_ROOK
void LAPACK(zsytrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CHETRF_ROOK
void LAPACK(chetrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZHETRF_ROOK
void LAPACK(zhetrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
////////////
// XGETRF //
////////////
#if INCLUDE_SGETRF
void LAPACK(sgetrf)(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_DGETRF
void LAPACK(dgetrf)(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_CGETRF
void LAPACK(cgetrf)(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_ZGETRF
void LAPACK(zgetrf)(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
}
#endif
////////////
// XGBTRF //
////////////
#if INCLUDE_SGBTRF
void LAPACK(sgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_DGBTRF
void LAPACK(dgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_CGBTRF
void LAPACK(cgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_ZGBTRF
void LAPACK(zgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
////////////
// XTRSYL //
////////////
#if INCLUDE_STRSYL
void LAPACK(strsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_DTRSYL
void LAPACK(dtrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_CTRSYL
void LAPACK(ctrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_ZTRSYL
void LAPACK(ztrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
////////////
// XTGSYL //
////////////
#if INCLUDE_STGSYL
void LAPACK(stgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_DTGSYL
void LAPACK(dtgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_CTGSYL
void LAPACK(ctgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_ZTGSYL
void LAPACK(ztgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
////////////
// XGEMMT //
////////////
#if INCLUDE_SGEMMT
void LAPACK(sgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
RELAPACK_sgemmt(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DGEMMT
void LAPACK(dgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
RELAPACK_dgemmt(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CGEMMT
void LAPACK(cgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
RELAPACK_cgemmt(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZGEMMT
void LAPACK(zgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
RELAPACK_zgemmt(uplo, n, A, ldA, info);
}
#endif

View File

@@ -0,0 +1,607 @@
#include "relapack.h"
////////////
// XLAUUM //
////////////
#if INCLUDE_SLAUUM
void LAPACK(slauum)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_slauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DLAUUM
void LAPACK(dlauum)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dlauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CLAUUM
void LAPACK(clauum)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_clauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZLAUUM
void LAPACK(zlauum)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_zlauum(uplo, n, A, ldA, info);
}
#endif
////////////
// XSYGST //
////////////
#if INCLUDE_SSYGST
void LAPACK(ssygst)(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_DSYGST
void LAPACK(dsygst)(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_CSYGST
void LAPACK(csygst)(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_ZSYGST
void LAPACK(zsygst)(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
////////////
// XTRTRI //
////////////
#if INCLUDE_STRTRI
void LAPACK(strtri)(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_strtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_DTRTRI
void LAPACK(dtrtri)(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_CTRTRI
void LAPACK(ctrtri)(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_ZTRTRI
void LAPACK(ztrtri)(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
}
#endif
////////////
// XPOTRF //
////////////
#if INCLUDE_SPOTRF
void LAPACK(spotrf)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_spotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DPOTRF
void LAPACK(dpotrf)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dpotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CPOTRF
void LAPACK(cpotrf)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_cpotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZPOTRF
void LAPACK(zpotrf)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_zpotrf(uplo, n, A, ldA, info);
}
#endif
////////////
// XPBTRF //
////////////
#if INCLUDE_SPBTRF
void LAPACK(spbtrf)(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_DPBTRF
void LAPACK(dpbtrf)(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_CPBTRF
void LAPACK(cpbtrf)(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_ZPBTRF
void LAPACK(zpbtrf)(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
////////////
// XSYTRF //
////////////
#if INCLUDE_SSYTRF
void LAPACK(ssytrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_DSYTRF
void LAPACK(dsytrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CSYTRF
void LAPACK(csytrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZSYTRF
void LAPACK(zsytrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CHETRF
void LAPACK(chetrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZHETRF
void LAPACK(zhetrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_SSYTRF_ROOK
void LAPACK(ssytrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_DSYTRF_ROOK
void LAPACK(dsytrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CSYTRF_ROOK
void LAPACK(csytrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZSYTRF_ROOK
void LAPACK(zsytrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CHETRF_ROOK
void LAPACK(chetrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZHETRF_ROOK
void LAPACK(zhetrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
////////////
// XGETRF //
////////////
#if INCLUDE_SGETRF
void LAPACK(sgetrf)(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_DGETRF
void LAPACK(dgetrf)(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_CGETRF
void LAPACK(cgetrf)(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_ZGETRF
void LAPACK(zgetrf)(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
}
#endif
////////////
// XGBTRF //
////////////
#if INCLUDE_SGBTRF
void LAPACK(sgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_DGBTRF
void LAPACK(dgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_CGBTRF
void LAPACK(cgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_ZGBTRF
void LAPACK(zgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
////////////
// XTRSYL //
////////////
#if INCLUDE_STRSYL
void LAPACK(strsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_DTRSYL
void LAPACK(dtrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_CTRSYL
void LAPACK(ctrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_ZTRSYL
void LAPACK(ztrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
////////////
// XTGSYL //
////////////
#if INCLUDE_STGSYL
void LAPACK(stgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_DTGSYL
void LAPACK(dtgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_CTGSYL
void LAPACK(ctgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_ZTGSYL
void LAPACK(ztgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
////////////
// XGEMMT //
////////////
#if INCLUDE_SGEMMT
void LAPACK(sgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
RELAPACK_sgemmt(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DGEMMT
void LAPACK(dgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
RELAPACK_dgemmt(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CGEMMT
void LAPACK(cgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
RELAPACK_cgemmt(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZGEMMT
void LAPACK(zgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
RELAPACK_zgemmt(uplo, n, A, ldA, info);
}
#endif

60
relapack/src/relapack.h Normal file
View File

@@ -0,0 +1,60 @@
#ifndef RELAPACK_INT_H
#define RELAPACK_INT_H
#include "../config.h"
#include "../inc/relapack.h"
// add an underscore to BLAS routines (or not)
#if BLAS_UNDERSCORE
#define BLAS(routine) routine ## _
#else
#define BLAS(routine) routine
#endif
// add an underscore to LAPACK routines (or not)
#if LAPACK_UNDERSCORE
#define LAPACK(routine) routine ## _
#else
#define LAPACK(routine) routine
#endif
// minimum and maximum macros
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
// REC_SPLIT(n) returns how a problem of size n is split recursively.
// If n >= 16, we ensure that the size of at least one of the halves is
// divisible by 8 (the cache line size in most CPUs), while both halves are
// still as close as possible in size.
// If n < 16 the problem is simply split in the middle. (Note that the
// crossoversize is usually larger than 16.)
#define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2)
#define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
#define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
#define ZREC_SPLIT(n) ((n >= 8) ? ((n + 4) / 8) * 4 : n / 2)
#include "lapack.h"
#include "blas.h"
// sytrf helper routines
void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
// trsyl helper routines
void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
#endif /* RELAPACK_INT_H */

227
relapack/src/sgbtrf.c Normal file
View File

@@ -0,0 +1,227 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *,
const int *, float *, const int *, int *, float *, const int *, float *,
const int *, int *);
/** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's sgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html
* */
void RELAPACK_sgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SGBTRF", &minfo);
return;
}
// Constant
const float ZERO[] = { 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskewg A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + kv;
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
float *const A_j = A + *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[i] = 0.;
}
// Allocate work space
const int n1 = SREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
float *Worku = malloc(mWorku * nWorku * sizeof(float));
LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** sgbtrf's recursive compute kernel */
static void RELAPACK_sgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
// Unblocked
LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + kv;
// Splitting
const int n1 = MIN(SREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
float *const Ab_L = Ab;
float *const Ab_BR = Ab + *ldAb * n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + m1;
float *const A_BR = A + *ldA * n1 + m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
float *const A_Rl = A_R;
float *const A_Rr = A_R + *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + m21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + *ldA * n21;
float *const A_BRbl = A_BR + m21;
float *const A_BRbr = A_BR + *ldA * n21 + m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
float *const A_Rrj = A_Rr + *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const float tmp = A_Rrj[i];
A_Rrj[i] = A_Rr[ip];
A_Rrj[ip] = tmp;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}

165
relapack/src/sgemmt.c Normal file
View File

@@ -0,0 +1,165 @@
#include "relapack.h"
static void RELAPACK_sgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
/** SGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_sgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !notransA)
info = 2;
else if (!tranB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("SGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : 'T';
const char cleantransB = notransB ? 'N' : 'T';
// Recursive kernel
RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** sgemmt's recursive compute kernel */
static void RELAPACK_sgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_SGEMMT, 1)) {
// Unblocked
RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const float *const A_T = A;
const float *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const float *const B_L = B;
const float *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
float *const C_TL = C;
float *const C_TR = C + *ldC * n1;
float *const C_BL = C + n1;
float *const C_BR = C + *ldC * n1 + n1;
// recursion(C_TL)
RELAPACK_sgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(sgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(sgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_sgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** sgemmt's unblocked compute kernel */
static void RELAPACK_sgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const float *const A_0 = A;
const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
float *const C_0i = C + *ldC * i;
float *const C_ii = C + *ldC * i + i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}

117
relapack/src/sgetrf.c Normal file
View File

@@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *,
int *, int *);
/** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's sgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html
* */
void RELAPACK_sgetrf(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const float ONE[] = { 1. };
const int iONE[] = { 1. };
// Splitting
const int rn = *n - *m;
// A_L A_R
const float *const A_L = A;
float *const A_R = A + *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** sgetrf's recursive compute kernel */
static void RELAPACK_sgetrf_rec(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_SGETRF, 1)) {
// Unblocked
LAPACK(sgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}

87
relapack/src/slauum.c Normal file
View File

@@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_slauum_rec(const char *, const int *, float *,
const int *, int *);
/** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's slauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html
* */
void RELAPACK_slauum(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info);
}
/** slauum's recursive compute kernel */
static void RELAPACK_slauum_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_SLAUUM, 1)) {
// Unblocked
LAPACK(slauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1. };
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_slauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(ssyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(ssyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_slauum_rec(uplo, &n2, A_BR, ldA, info);
}

157
relapack/src/spbtrf.c Normal file
View File

@@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_spbtrf_rec(const char *, const int *, const int *,
float *, const int *, float *, const int *, int *);
/** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's spbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html
* */
void RELAPACK_spbtrf(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const float ZERO[] = { 0. };
// Allocate work space
const int n1 = SREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
float *Work = malloc(mWork * nWork * sizeof(float));
LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** spbtrf's recursive compute kernel */
static void RELAPACK_spbtrf_rec(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
float *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_SPBTRF, 1)) {
// Unblocked
LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(SREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
float *const Ab_BR = Ab + *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, *kd);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + n21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + *ldA * n21;
float *const A_BRbl = A_BR + n21;
float *const A_BRbr = A_BR + *ldA * n21 + n21;
if (*uplo == 'L') {
// A_BLt = ABLt / A_TL'
BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/spotrf.c Normal file
View File

@@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_spotrf_rec(const char *, const int *, float *,
const int *, int *);
/** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's spotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html
* */
void RELAPACK_spotrf(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** spotrf's recursive compute kernel */
static void RELAPACK_spotrf_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_SPOTRF, 1)) {
// Unblocked
LAPACK(spotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

212
relapack/src/ssygst.c Normal file
View File

@@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_ssygst_rec(const int *, const char *, const int *,
float *, const int *, const float *, const int *,
float *, const int *, int *);
/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's ssygst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html
* */
void RELAPACK_ssygst(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SSYGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
float *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = SREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * sizeof(float));
if (!Work)
lWork = 0;
#endif
// Recursive kernel
RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** ssygst's recursive compute kernel */
static void RELAPACK_ssygst_rec(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
float *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
// Unblocked
LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const float ZERO[] = { 0. };
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const float HALF[] = { .5 };
const float MHALF[] = { -.5 };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// B_TL B_TR
// B_BL B_BR
const float *const B_TL = B;
const float *const B_TR = B + *ldB * n1;
const float *const B_BL = B + n1;
const float *const B_BR = B + *ldB * n1 + n1;
// recursion(A_TL, B_TL)
RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_BL + T
for (i = 0; i < n2; i++)
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_TR = A_BL + T
for (i = 0; i < n2; i++)
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}

238
relapack/src/ssytrf.c Normal file
View File

@@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** SSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's ssytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html
* */
void RELAPACK_ssytrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** ssytrf's recursive compute kernel */
static void RELAPACK_ssytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_SSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = SREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
float *const A_BL_B = A + *n;
float *const A_BR_B = A + *ldA * n1 + *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + n1;
float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = SREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + *ldA * n_rest;
float *const A_TR_T = A + *ldA * (n_rest + n1);
float *const A_TL = A + *ldA * n_rest + n_rest;
float *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

351
relapack/src/ssytrf_rec2.c Normal file
View File

@@ -0,0 +1,351 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static int c__1 = 1;
static float c_b8 = -1.f;
static float c_b9 = 1.f;
/** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method.
*
* This routine is a minor modification of LAPACK's slasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int *
nb, int *kb, float *a, int *lda, int *ipiv, float *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
float r__1, r__2, r__3;
/* Builtin functions */
double sqrt(double);
/* Local variables */
static int j, k;
static float t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int sscal_(int *, float *, float *, int *),
sgemv_(char *, int *, int *, float *, float *, int *,
float *, int *, float *, float *, int *, ftnlen);
static int kstep;
extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
int *), sswap_(int *, float *, int *, float *, int *
);
static float absakk;
extern int isamax_(int *, float *, int *);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1));
if (imax > 1) {
i__1 = imax - 1;
jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1],
dabs(r__1));
rowmax = dmax(r__2,r__3);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >=
alpha * rowmax) {
kp = imax;
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
i__1 = kk - 1 - kp;
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
r1 = 1.f / a[k + k * a_dim1];
i__1 = k - 1;
sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
d21 = w[k - 1 + kw * w_dim1];
d11 = w[k + kw * w_dim1] / d21;
d22 = w[k - 1 + (kw - 1) * w_dim1] / d21;
t = 1.f / (d11 * d22 - 1.f);
d21 = t / d21;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1)
* w_dim1] - w[j + kw * w_dim1]);
a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] -
w[j + (kw - 1) * w_dim1]);
/* L20: */
}
}
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
a[k + k * a_dim1] = w[k + kw * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
absakk = (r__1 = w[k + k * w_dim1], dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1],
dabs(r__1));
rowmax = dmax(r__2,r__3);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >=
alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
kk = k + kstep - 1;
if (kp != kk) {
a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
i__1 = kp - kk - 1;
scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
r1 = 1.f / a[k + k * a_dim1];
i__1 = *n - k;
sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
d21 = w[k + 1 + k * w_dim1];
d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
d22 = w[k + k * w_dim1] / d21;
t = 1.f / (d11 * d22 - 1.f);
d21 = t / d21;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] -
w[j + (k + 1) * w_dim1]);
a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) *
w_dim1] - w[j + k * w_dim1]);
/* L80: */
}
}
a[k + k * a_dim1] = w[k + k * w_dim1];
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/ssytrf_rook.c Normal file
View File

@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's ssytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html
* */
void RELAPACK_ssytrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** ssytrf_rook's recursive compute kernel */
static void RELAPACK_ssytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = SREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
float *const A_BL_B = A + *n;
float *const A_BR_B = A + *ldA * n1 + *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + n1;
float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = SREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + *ldA * n_rest;
float *const A_TR_T = A + *ldA * (n_rest + n1);
float *const A_TL = A + *ldA * n_rest + n_rest;
float *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

View File

@@ -0,0 +1,451 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static int c__1 = 1;
static float c_b9 = -1.f;
static float c_b10 = 1.f;
/** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's slasyf_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, float *a, int *lda, int *ipiv, float *
w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
float r__1;
/* Builtin functions */
double sqrt(double);
/* Local variables */
static int j, k, p;
static float t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int sscal_(int *, float *, float *, int *);
static float sfmin;
static int itemp;
extern /* Subroutine */ int sgemv_(char *, int *, int *, float *,
float *, int *, float *, int *, float *, float *, int *,
ftnlen);
static int kstep;
static float stemp;
extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
int *), sswap_(int *, float *, int *, float *, int *
);
static float absakk;
extern double slamch_(char *, ftnlen);
extern int isamax_(int *, float *, int *);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) <
alpha * rowmax)) {
kp = imax;
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
a[kp + k * a_dim1] = a[kk + k * a_dim1];
i__1 = k - 1 - kp;
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) {
r1 = 1.f / a[k + k * a_dim1];
i__1 = k - 1;
sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else if (a[k + k * a_dim1] != 0.f) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
a[ii + k * a_dim1] /= a[k + k * a_dim1];
/* L14: */
}
}
}
} else {
if (k > 2) {
d12 = w[k - 1 + kw * w_dim1];
d11 = w[k + kw * w_dim1] / d12;
d22 = w[k - 1 + (kw - 1) * w_dim1] / d12;
t = 1.f / (d11 * d22 - 1.f);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) *
w_dim1] - w[j + kw * w_dim1]) / d12);
a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] -
w[j + (kw - 1) * w_dim1]) / d12);
/* L20: */
}
}
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
a[k + k * a_dim1] = w[k + kw * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
absakk = (r__1 = w[k + k * w_dim1], dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k
+ 1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) <
alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
a[kp + k * a_dim1] = a[kk + k * a_dim1];
i__1 = kp - k - 1;
scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) {
r1 = 1.f / a[k + k * a_dim1];
i__1 = *n - k;
sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else if (a[k + k * a_dim1] != 0.f) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
a[ii + k * a_dim1] /= a[k + k * a_dim1];
/* L74: */
}
}
}
} else {
if (k < *n - 1) {
d21 = w[k + 1 + k * w_dim1];
d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
d22 = w[k + k * w_dim1] / d21;
t = 1.f / (d11 * d22 - 1.f);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
j + (k + 1) * w_dim1]) / d21);
a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
w_dim1] - w[j + k * w_dim1]) / d21);
/* L80: */
}
}
a[k + k * a_dim1] = w[k + k * w_dim1];
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

274
relapack/src/stgsyl.c Normal file
View File

@@ -0,0 +1,274 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_stgsyl_rec(const char *, const int *, const int *,
const int *, const float *, const int *, const float *, const int *,
float *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, float *, float *, int *, int *,
int *);
/** STGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's stgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html
* */
void RELAPACK_stgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "T");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("STGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'T';
// Constant
const float ZERO[] = { 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
float scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
float dscale = 0;
float dsum = 1;
int pq;
RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(pq) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(slacpy)("F", m, n, C, ldC, Work, m);
LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m);
LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(slacpy)("F", m, n, Work, m, C, ldC);
LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** stgsyl's recursive vompute kernel */
static void RELAPACK_stgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dsum, float *dscale,
int *iWork, int *pq, int *info
) {
if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) {
// Unblocked
LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1. };
float scale2[] = { 1. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
int m1 = SREC_SPLIT(*m);
if (A[m1 + *ldA * (m1 - 1)])
m1++;
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + *ldA * m1;
const float *const A_BR = A + *ldA * m1 + m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + m1;
// D_TL D_TR
// 0 D_BR
const float *const D_TL = D;
const float *const D_TR = D + *ldD * m1;
const float *const D_BR = D + *ldD * m1 + m1;
// F_T
// F_B
float *const F_T = F;
float *const F_B = F + m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
// C_T = C_T - A_TR * C_B
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
int n1 = SREC_SPLIT(*n);
if (B[n1 + *ldB * (n1 - 1)])
n1++;
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + *ldB * n1;
const float *const B_BR = B + *ldB * n1 + n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + *ldC * n1;
// E_TL E_TR
// 0 E_BR
const float *const E_TL = E;
const float *const E_TR = E + *ldE * n1;
const float *const E_BR = E + *ldE * n1 + n1;
// F_L F_R
float *const F_L = F;
float *const F_R = F + *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
// C_R = C_R + F_L * B_TR
BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

169
relapack/src/strsyl.c Normal file
View File

@@ -0,0 +1,169 @@
#include "relapack.h"
static void RELAPACK_strsyl_rec(const char *, const char *, const int *,
const int *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, int *);
/** STRSYL solves the real Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's strsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html
* */
void RELAPACK_strsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int transA = LAPACK(lsame)(tranA, "T");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int transB = LAPACK(lsame)(tranB, "T");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!transA && !ctransA && !notransA)
*info = -1;
else if (!transB && !ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("STRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
// Recursive kernel
RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** strsyl's recursive compute kernel */
static void RELAPACK_strsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) {
// Unblocked
RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const float MSGN[] = { -*isgn };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1. };
float scale2[] = { 1. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
int m1 = SREC_SPLIT(*m);
if (A[m1 + *ldA * (m1 - 1)])
m1++;
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + *ldA * m1;
const float *const A_BR = A + *ldA * m1 + m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + m1;
if (*tranA == 'N') {
// recusion(A_BR, B, C_B)
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recusion(A_TL, B, C_T)
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recusion(A_TL, B, C_T)
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recusion(A_BR, B, C_B)
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
}
} else {
// Splitting
int n1 = SREC_SPLIT(*n);
if (B[n1 + *ldB * (n1 - 1)])
n1++;
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + *ldB * n1;
const float *const B_BR = B + *ldB * n1 + n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + *ldC * n1;
if (*tranB == 'N') {
// recusion(A, B_TL, C_L)
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recusion(A, B_BR, C_R)
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recusion(A, B_BR, C_R)
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recusion(A, B_TL, C_L)
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

1029
relapack/src/strsyl_rec2.c Normal file

File diff suppressed because it is too large Load Diff

107
relapack/src/strtri.c Normal file
View File

@@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_strtri_rec(const char *, const char *, const int *,
float *, const int *, int *);
/** CTRTRI computes the inverse of a real upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's strtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html
* */
void RELAPACK_strtri(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("STRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[i + *ldA * i] == 0) {
*info = i;
return;
}
}
// Recursive kernel
RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** strtri's recursive compute kernel */
static void RELAPACK_strtri_rec(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_STRTRI, 1)) {
// Unblocked
LAPACK(strti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

230
relapack/src/zgbtrf.c Normal file
View File

@@ -0,0 +1,230 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *,
const int *, double *, const int *, int *, double *, const int *, double *,
const int *, int *);
/** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's zgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html
* */
void RELAPACK_zgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZGBTRF", &minfo);
return;
}
// Constant
const double ZERO[] = { 0., 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + 2 * kv;
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
double *const A_j = A + 2 * *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[2 * i] = A_j[2 * i + 1] = 0.;
}
// Allocate work space
const int n1 = ZREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double));
double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double));
LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** zgbtrf's recursive compute kernel */
static void RELAPACK_zgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) {
// Unblocked
LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + 2 * kv;
// Splitting
const int n1 = MIN(ZREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
double *const Ab_L = Ab;
double *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * m1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
double *const A_Rl = A_R;
double *const A_Rr = A_R + 2 * *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + 2 * *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + 2 * m21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + 2 * *ldA * n21;
double *const A_BRbl = A_BR + 2 * m21;
double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
double *const A_Rrj = A_Rr + 2 * *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const double tmpr = A_Rrj[2 * i];
const double tmpc = A_Rrj[2 * i + 1];
A_Rrj[2 * i] = A_Rrj[2 * ip];
A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
A_Rrj[2 * ip] = tmpr;
A_Rrj[2 * ip + 1] = tmpc;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}

167
relapack/src/zgemmt.c Normal file
View File

@@ -0,0 +1,167 @@
#include "relapack.h"
static void RELAPACK_zgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
/** ZGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_zgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int ctransA = LAPACK(lsame)(transA, "C");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
const int ctransB = LAPACK(lsame)(transB, "C");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !ctransA && !notransA)
info = 2;
else if (!tranB && !ctransB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("ZGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');
// Recursive kernel
RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** zgemmt's recursive compute kernel */
static void RELAPACK_zgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) {
// Unblocked
RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const double *const A_T = A;
const double *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const double *const B_L = B;
const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
double *const C_TL = C;
double *const C_TR = C + 2 * *ldC * n1;
double *const C_BL = C + 2 * n1;
double *const C_BR = C + 2 * *ldC * n1 + 2 * n1;
// recursion(C_TL)
RELAPACK_zgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(zgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(zgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_zgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** zgemmt's unblocked compute kernel */
static void RELAPACK_zgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const double *const A_0 = A;
const double *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
double *const C_0i = C + 2 * *ldC * i;
double *const C_ii = C + 2 * *ldC * i + 2 * i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(zgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}

117
relapack/src/zgetrf.c Normal file
View File

@@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_zgetrf_rec(const int *, const int *, double *,
const int *, int *, int *);
/** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's zgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html
* */
void RELAPACK_zgetrf(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const double ONE[] = { 1., 0. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const double *const A_L = A;
double *const A_R = A + 2 * *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** zgetrf's recursive compute kernel */
static void RELAPACK_zgetrf_rec(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_ZGETRF, 1)) {
// Unblocked
LAPACK(zgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1. };
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}

212
relapack/src/zhegst.c Normal file
View File

@@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_zhegst_rec(const int *, const char *, const int *,
double *, const int *, const double *, const int *,
double *, const int *, int *);
/** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's zhegst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html
* */
void RELAPACK_zhegst(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZHEGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
double *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = ZREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * 2 * sizeof(double));
if (!Work)
lWork = 0;
#endif
// recursive kernel
RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** zhegst's recursive compute kernel */
static void RELAPACK_zhegst_rec(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
double *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_ZHEGST, 1)) {
// Unblocked
LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const double ZERO[] = { 0., 0. };
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const double HALF[] = { .5, 0. };
const double MHALF[] = { -.5, 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// B_TL B_TR
// B_BL B_BR
const double *const B_TL = B;
const double *const B_TR = B + 2 * *ldB * n1;
const double *const B_BL = B + 2 * n1;
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// recursion(A_TL, B_TL)
RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork >= n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork >= n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork >= n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_BL + T
for (i = 0; i < n2; i++)
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork >= n2 * n1)
// A_TR = A_BL + T
for (i = 0; i < n2; i++)
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork >= n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork >= n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork >= n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork >= n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}

236
relapack/src/zhetrf.c Normal file
View File

@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's zhetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html
* */
void RELAPACK_zhetrf(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** zhetrf's recursive compute kernel */
static void RELAPACK_zhetrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_ZHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = ZREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
double *const A_BL_B = A + 2 * *n;
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + 2 * n1;
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = ZREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + 2 * *ldA * n_rest;
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

524
relapack/src/zhetrf_rec2.c Normal file
View File

@@ -0,0 +1,524 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static doublecomplex c_b1 = {1.,0.};
static int c__1 = 1;
/** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method
*
* This routine is a minor modification of LAPACK's zlahef.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int *
nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
double d__1, d__2, d__3, d__4;
doublecomplex z__1, z__2, z__3, z__4;
/* Builtin functions */
double sqrt(double), d_imag(doublecomplex *);
void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k;
static double t, r1;
static doublecomplex d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static double alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
static int kstep;
extern /* Subroutine */ int zgemv_(char *, int *, int *,
doublecomplex *, doublecomplex *, int *, doublecomplex *,
int *, doublecomplex *, doublecomplex *, int *, ftnlen),
zcopy_(int *, doublecomplex *, int *, doublecomplex *,
int *), zswap_(int *, doublecomplex *, int *,
doublecomplex *, int *);
static double absakk;
extern /* Subroutine */ int zdscal_(int *, double *,
doublecomplex *, int *);
static double colmax;
extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
;
extern int izamax_(int *, doublecomplex *, int *);
static double rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
i__1 = k - 1;
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
i__1 = k + kw * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1));
if (k > 1) {
i__1 = k - 1;
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
kw * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - 1;
zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
i__1 = k - imax;
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
i__1 = k - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
jmax + (kw - 1) * w_dim1]), abs(d__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs(
d__2));
rowmax = max(d__3,d__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) {
kp = imax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = kk - 1 - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
r1 = 1. / a[i__1].r;
i__1 = k - 1;
zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
i__1 = k - 1;
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
d_cnjg(&z__2, &d21);
z_div(&z__1, &w[k + kw * w_dim1], &z__2);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1. / (z__1.r - 1.);
z__2.r = t, z__2.i = 0.;
z_div(&z__1, &z__2, &d21);
d21.r = z__1.r, d21.i = z__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + k * a_dim1;
d_cnjg(&z__2, &d21);
i__3 = j + kw * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
z__2.r * z__3.i + z__2.i * z__3.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
i__1 = k + k * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1));
if (k < *n) {
i__1 = *n - k;
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
k * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (imax < *n) {
i__1 = *n - imax;
zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
i__1 = imax - k;
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
jmax + (k + 1) * w_dim1]), abs(d__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs(
d__2));
rowmax = max(d__3,d__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = kp - kk - 1;
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
r1 = 1. / a[i__1].r;
i__1 = *n - k;
zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
i__1 = *n - k;
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
d_cnjg(&z__2, &d21);
z_div(&z__1, &w[k + k * w_dim1], &z__2);
d22.r = z__1.r, d22.i = z__1.i;
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1. / (z__1.r - 1.);
z__2.r = t, z__2.i = 0.;
z_div(&z__1, &z__2, &d21);
d21.r = z__1.r, d21.i = z__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
d_cnjg(&z__2, &d21);
i__3 = j + k * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
z__2.r * z__3.i + z__2.i * z__3.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/zhetrf_rook.c Normal file
View File

@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's zhetrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html
* */
void RELAPACK_zhetrf_rook(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** zhetrf_rook's recursive compute kernel */
static void RELAPACK_zhetrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = ZREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
double *const A_BL_B = A + 2 * *n;
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + 2 * n1;
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = ZREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + 2 * *ldA * n_rest;
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

View File

@@ -0,0 +1,662 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static doublecomplex c_b1 = {1.,0.};
static int c__1 = 1;
/** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method
*
* This routine is a minor modification of LAPACK's zlahef_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, doublecomplex *a, int *lda, int *
ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
double d__1, d__2;
doublecomplex z__1, z__2, z__3, z__4, z__5;
/* Builtin functions */
double sqrt(double), d_imag(doublecomplex *);
void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k, p;
static double t, r1;
static doublecomplex d11, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static double alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
static double dtemp, sfmin;
static int itemp, kstep;
extern /* Subroutine */ int zgemv_(char *, int *, int *,
doublecomplex *, doublecomplex *, int *, doublecomplex *,
int *, doublecomplex *, doublecomplex *, int *, ftnlen),
zcopy_(int *, doublecomplex *, int *, doublecomplex *,
int *), zswap_(int *, doublecomplex *, int *,
doublecomplex *, int *);
extern double dlamch_(char *, ftnlen);
static double absakk;
extern /* Subroutine */ int zdscal_(int *, double *,
doublecomplex *, int *);
static double colmax;
extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
;
extern int izamax_(int *, doublecomplex *, int *);
static double rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
sfmin = dlamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
if (k > 1) {
i__1 = k - 1;
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
c__1);
}
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
i__1 = k + kw * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1));
if (k > 1) {
i__1 = k - 1;
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
kw * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
d__1 = w[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
if (k > 1) {
i__1 = k - 1;
zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
&c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
if (imax > 1) {
i__1 = imax - 1;
zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
}
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
i__1 = k - imax;
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
w[jmax + (kw - 1) * w_dim1]), abs(d__2));
} else {
rowmax = 0.;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
itemp + (kw - 1) * w_dim1]), abs(d__2));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) {
kp = imax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = k - 1 - p;
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
i__1 = k - 1 - p;
zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
if (p > 1) {
i__1 = p - 1;
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = kk - 1 - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (abs(t) >= sfmin) {
r1 = 1. / t;
i__1 = k - 1;
zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L14: */
}
}
i__1 = k - 1;
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
d_cnjg(&z__2, &d21);
z_div(&z__1, &w[k + kw * w_dim1], &z__2);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1. / (z__1.r - 1.);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d21);
z__1.r = t * z__2.r, z__1.i = t * z__2.i;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
d_cnjg(&z__5, &d21);
z_div(&z__2, &z__3, &z__5);
z__1.r = t * z__2.r, z__1.i = t * z__2.i;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
++jj;
if (kstep == 2 && jp1 != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
i__1 = k + k * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1));
if (k < *n) {
i__1 = *n - k;
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
k * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
d__1 = w[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
a_dim1], &c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (imax < *n) {
i__1 = *n - imax;
zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
w[jmax + (k + 1) * w_dim1]), abs(d__2));
} else {
rowmax = 0.;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
itemp + (k + 1) * w_dim1]), abs(d__2));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = p - k - 1;
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
a_dim1], lda);
i__1 = p - k - 1;
zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
if (p < *n) {
i__1 = *n - p;
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
* a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
}
zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = kp - kk - 1;
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (abs(t) >= sfmin) {
r1 = 1. / t;
i__1 = *n - k;
zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L74: */
}
}
i__1 = *n - k;
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
d_cnjg(&z__2, &d21);
z_div(&z__1, &w[k + k * w_dim1], &z__2);
d22.r = z__1.r, d22.i = z__1.i;
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1. / (z__1.r - 1.);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
d_cnjg(&z__5, &d21);
z_div(&z__2, &z__3, &z__5);
z__1.r = t * z__2.r, z__1.i = t * z__2.i;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d21);
z__1.r = t * z__2.r, z__1.i = t * z__2.i;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
--jj;
if (kstep == 2 && jp1 != jj && j >= 1) {
zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

87
relapack/src/zlauum.c Normal file
View File

@@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_zlauum_rec(const char *, const int *, double *,
const int *, int *);
/** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's zlauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html
* */
void RELAPACK_zlauum(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_zlauum_rec(&cleanuplo, n, A, ldA, info);
}
/** zlauum's recursive compute kernel */
static void RELAPACK_zlauum_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) {
// Unblocked
LAPACK(zlauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info);
}

157
relapack/src/zpbtrf.c Normal file
View File

@@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *,
double *, const int *, double *, const int *, int *);
/** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's zpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html
* */
void RELAPACK_zpbtrf(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const double ZERO[] = { 0., 0. };
// Allocate work space
const int n1 = ZREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
double *Work = malloc(mWork * nWork * 2 * sizeof(double));
LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** zpbtrf's recursive compute kernel */
static void RELAPACK_zpbtrf_rec(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
double *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) {
// Unblocked
LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(ZREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
double *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, *kd);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + 2 * *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + 2 * n21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + 2 * *ldA * n21;
double *const A_BRbl = A_BR + 2 * n21;
double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
if (*uplo == 'L') {
// A_BLt = ABLt / A_TL'
BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/zpotrf.c Normal file
View File

@@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_zpotrf_rec(const char *, const int *, double *,
const int *, int *);
/** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's zpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html
* */
void RELAPACK_zpotrf(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_zpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** zpotrf's recursive compute kernel */
static void RELAPACK_zpotrf_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) {
// Unblocked
LAPACK(zpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

238
relapack/src/zsytrf.c Normal file
View File

@@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's zsytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html
* */
void RELAPACK_zsytrf(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** zsytrf's recursive compute kernel */
static void RELAPACK_zsytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = ZREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
double *const A_BL_B = A + 2 * *n;
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + 2 * n1;
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = ZREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + 2 * *ldA * n_rest;
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

452
relapack/src/zsytrf_rec2.c Normal file
View File

@@ -0,0 +1,452 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static doublecomplex c_b1 = {1.,0.};
static int c__1 = 1;
/** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method.
*
* This routine is a minor modification of LAPACK's zlasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int *
nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
double d__1, d__2, d__3, d__4;
doublecomplex z__1, z__2, z__3;
/* Builtin functions */
double sqrt(double), d_imag(doublecomplex *);
void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k;
static doublecomplex t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static double alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int zscal_(int *, doublecomplex *,
doublecomplex *, int *);
static int kstep;
extern /* Subroutine */ int zgemv_(char *, int *, int *,
doublecomplex *, doublecomplex *, int *, doublecomplex *,
int *, doublecomplex *, doublecomplex *, int *, ftnlen),
zcopy_(int *, doublecomplex *, int *, doublecomplex *,
int *), zswap_(int *, doublecomplex *, int *,
doublecomplex *, int *);
static double absakk, colmax;
extern int izamax_(int *, doublecomplex *, int *);
static double rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
i__1 = k + kw * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
w_dim1]), abs(d__2));
if (k > 1) {
i__1 = k - 1;
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
kw * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
jmax + (kw - 1) * w_dim1]), abs(d__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs(
d__2));
rowmax = max(d__3,d__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha *
rowmax) {
kp = imax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kk - 1 - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
r1.r = z__1.r, r1.i = z__1.i;
i__1 = k - 1;
zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + kw * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
d22.i + d11.i * d22.r;
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
z_div(&z__1, &c_b1, &z__2);
t.r = z__1.r, t.i = z__1.i;
z_div(&z__1, &t, &d21);
d21.r = z__1.r, d21.i = z__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
i__1 = k + k * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k *
w_dim1]), abs(d__2));
if (k < *n) {
i__1 = *n - k;
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
k * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
jmax + (k + 1) * w_dim1]), abs(d__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs(
d__2));
rowmax = max(d__3,d__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha *
rowmax) {
kp = imax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - kk - 1;
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
r1.r = z__1.r, r1.i = z__1.i;
i__1 = *n - k;
zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k + k * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
d22.i + d11.i * d22.r;
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
z_div(&z__1, &c_b1, &z__2);
t.r = z__1.r, t.i = z__1.i;
z_div(&z__1, &t, &d21);
d21.r = z__1.r, d21.i = z__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/zsytrf_rook.c Normal file
View File

@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's zsytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html
* */
void RELAPACK_zsytrf_rook(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** zsytrf_rook's recursive compute kernel */
static void RELAPACK_zsytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = ZREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
double *const A_BL_B = A + 2 * *n;
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + 2 * n1;
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = ZREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + 2 * *ldA * n_rest;
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

View File

@@ -0,0 +1,561 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static doublecomplex c_b1 = {1.,0.};
static int c__1 = 1;
/** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's zlasyf_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, doublecomplex *a, int *lda, int *
ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
double d__1, d__2;
doublecomplex z__1, z__2, z__3, z__4;
/* Builtin functions */
double sqrt(double), d_imag(doublecomplex *);
void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k, p;
static doublecomplex t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static double alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
static double dtemp, sfmin;
extern /* Subroutine */ int zscal_(int *, doublecomplex *,
doublecomplex *, int *);
static int itemp, kstep;
extern /* Subroutine */ int zgemv_(char *, int *, int *,
doublecomplex *, doublecomplex *, int *, doublecomplex *,
int *, doublecomplex *, doublecomplex *, int *, ftnlen),
zcopy_(int *, doublecomplex *, int *, doublecomplex *,
int *), zswap_(int *, doublecomplex *, int *,
doublecomplex *, int *);
extern double dlamch_(char *, ftnlen);
static double absakk, colmax;
extern int izamax_(int *, doublecomplex *, int *);
static double rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
sfmin = dlamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
i__1 = k + kw * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
w_dim1]), abs(d__2));
if (k > 1) {
i__1 = k - 1;
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
kw * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
w[jmax + (kw - 1) * w_dim1]), abs(d__2));
} else {
rowmax = 0.;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
itemp + (kw - 1) * w_dim1]), abs(d__2));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax
+ (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) {
kp = imax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = k - 1 - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k +
k * a_dim1]), abs(d__2)) >= sfmin) {
z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
r1.r = z__1.r, r1.i = z__1.i;
i__1 = k - 1;
zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0. || a[i__1].i != 0.) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L14: */
}
}
}
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d12.r = w[i__1].r, d12.i = w[i__1].i;
z_div(&z__1, &w[k + kw * w_dim1], &d12);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
d22.r = z__1.r, d22.i = z__1.i;
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
d22.i + d11.i * d22.r;
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
z_div(&z__1, &c_b1, &z__2);
t.r = z__1.r, t.i = z__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d12);
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
z__2.i + t.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d12);
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
z__2.i + t.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
i__1 = k + k * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k *
w_dim1]), abs(d__2));
if (k < *n) {
i__1 = *n - k;
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
k * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
w[jmax + (k + 1) * w_dim1]), abs(d__2));
} else {
rowmax = 0.;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
itemp + (k + 1) * w_dim1]), abs(d__2));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax
+ (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - k - 1;
zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k +
k * a_dim1]), abs(d__2)) >= sfmin) {
z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
r1.r = z__1.r, r1.i = z__1.i;
i__1 = *n - k;
zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0. || a[i__1].i != 0.) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L74: */
}
}
}
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k + k * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
d22.i + d11.i * d22.r;
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
z_div(&z__1, &c_b1, &z__2);
t.r = z__1.r, t.i = z__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d21);
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
z__2.i + t.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d21);
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
z__2.i + t.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

268
relapack/src/ztgsyl.c Normal file
View File

@@ -0,0 +1,268 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *,
const int *, const double *, const int *, const double *, const int *,
double *, const int *, const double *, const int *, const double *,
const int *, double *, const int *, double *, double *, double *, int *);
/** ZTGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's ztgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html
* */
void RELAPACK_ztgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "C");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZTGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'C';
// Constant
const double ZERO[] = { 0., 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
double scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
double dscale = 0;
double dsum = 1;
RELAPACK_ztgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(zlacpy)("F", m, n, C, ldC, Work, m);
LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(zlacpy)("F", m, n, Work, m, C, ldC);
LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** ztgsyl's recursive vompute kernel */
static void RELAPACK_ztgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dsum, double *dscale,
int *info
) {
if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) {
// Unblocked
LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Outputs
double scale1[] = { 1., 0. };
double scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = ZREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const double *const A_TL = A;
const double *const A_TR = A + 2 * *ldA * m1;
const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
double *const C_T = C;
double *const C_B = C + 2 * m1;
// D_TL D_TR
// 0 D_BR
const double *const D_TL = D;
const double *const D_TR = D + 2 * *ldD * m1;
const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1;
// F_T
// F_B
double *const F_T = F;
double *const F_B = F + 2 * m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
// C_T = C_T - A_TR * C_B
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const double *const B_TL = B;
const double *const B_TR = B + 2 * *ldB * n1;
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
double *const C_L = C;
double *const C_R = C + 2 * *ldC * n1;
// E_TL E_TR
// 0 E_BR
const double *const E_TL = E;
const double *const E_TR = E + 2 * *ldE * n1;
const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1;
// F_L F_R
double *const F_L = F;
double *const F_R = F + 2 * *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
// C_R = C_R + F_L * B_TR
BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

163
relapack/src/ztrsyl.c Normal file
View File

@@ -0,0 +1,163 @@
#include "relapack.h"
static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *,
const int *, const int *, const double *, const int *, const double *,
const int *, double *, const int *, double *, int *);
/** ZTRSYL solves the complex Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's ztrsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html
* */
void RELAPACK_ztrsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!ctransA && !notransA)
*info = -1;
else if (!ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZTRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : 'C';
const char cleantranB = notransB ? 'N' : 'C';
// Recursive kernel
RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** ztrsyl's recursive compute kernel */
static void RELAPACK_ztrsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) {
// Unblocked
RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const double MSGN[] = { -*isgn, 0. };
const int iONE[] = { 1 };
// Outputs
double scale1[] = { 1., 0. };
double scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = ZREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const double *const A_TL = A;
const double *const A_TR = A + 2 * *ldA * m1;
const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
double *const C_T = C;
double *const C_B = C + 2 * m1;
if (*tranA == 'N') {
// recusion(A_BR, B, C_B)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recusion(A_TL, B, C_T)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recusion(A_TL, B, C_T)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recusion(A_BR, B, C_B)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
}
} else {
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const double *const B_TL = B;
const double *const B_TR = B + 2 * *ldB * n1;
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
double *const C_L = C;
double *const C_R = C + 2 * *ldC * n1;
if (*tranB == 'N') {
// recusion(A, B_TL, C_L)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recusion(A, B_BR, C_R)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recusion(A, B_BR, C_R)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recusion(A, B_TL, C_L)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

394
relapack/src/ztrsyl_rec2.c Normal file
View File

@@ -0,0 +1,394 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "../config.h"
#include "f2c.h"
#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) {
extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *);
doublecomplex result;
zdotu_(&result, n, x, incx, y, incy);
return result;
}
#define zdotu_ zdotu_fun
doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) {
extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *);
doublecomplex result;
zdotc_(&result, n, x, incx, y, incy);
return result;
}
#define zdotc_ zdotc_fun
#endif
#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) {
extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *);
doublecomplex result;
zladiv_(&result, a, b);
return result;
}
#define zladiv_ zladiv_fun
#endif
/* Table of constant values */
static int c__1 = 1;
/** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
*
* This routine is an exact copy of LAPACK's ztrsyl.
* It serves as an unblocked kernel in the recursive algorithms.
* */
/* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int
*isgn, int *m, int *n, doublecomplex *a, int *lda,
doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc,
double *scale, int *info, ftnlen trana_len, ftnlen tranb_len)
{
/* System generated locals */
int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
i__3, i__4;
double d__1, d__2;
doublecomplex z__1, z__2, z__3, z__4;
/* Builtin functions */
double d_imag(doublecomplex *);
void d_cnjg(doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k, l;
static doublecomplex a11;
static double db;
static doublecomplex x11;
static double da11;
static doublecomplex vec;
static double dum[1], eps, sgn, smin;
static doublecomplex suml, sumr;
extern int lsame_(char *, char *, ftnlen, ftnlen);
/* Double Complex */ doublecomplex zdotc_(int *,
doublecomplex *, int *, doublecomplex *, int *), zdotu_(
int *, doublecomplex *, int *,
doublecomplex *, int *);
extern /* Subroutine */ int dlabad_(double *, double *);
extern double dlamch_(char *, ftnlen);
static double scaloc;
extern /* Subroutine */ int xerbla_(char *, int *, ftnlen);
extern double zlange_(char *, int *, int *, doublecomplex *,
int *, double *, ftnlen);
static double bignum;
extern /* Subroutine */ int zdscal_(int *, double *,
doublecomplex *, int *);
/* Double Complex */ doublecomplex zladiv_(doublecomplex *,
doublecomplex *);
static int notrna, notrnb;
static double smlnum;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
b_dim1 = *ldb;
b_offset = 1 + b_dim1;
b -= b_offset;
c_dim1 = *ldc;
c_offset = 1 + c_dim1;
c__ -= c_offset;
/* Function Body */
notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
*info = 0;
if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
*info = -1;
} else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
*info = -2;
} else if (*isgn != 1 && *isgn != -1) {
*info = -3;
} else if (*m < 0) {
*info = -4;
} else if (*n < 0) {
*info = -5;
} else if (*lda < max(1,*m)) {
*info = -7;
} else if (*ldb < max(1,*n)) {
*info = -9;
} else if (*ldc < max(1,*m)) {
*info = -11;
}
if (*info != 0) {
i__1 = -(*info);
xerbla_("ZTRSY2", &i__1, (ftnlen)6);
return;
}
*scale = 1.;
if (*m == 0 || *n == 0) {
return;
}
eps = dlamch_("P", (ftnlen)1);
smlnum = dlamch_("S", (ftnlen)1);
bignum = 1. / smlnum;
dlabad_(&smlnum, &bignum);
smlnum = smlnum * (double) (*m * *n) / eps;
bignum = 1. / smlnum;
/* Computing MAX */
d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, (
ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n,
&b[b_offset], ldb, dum, (ftnlen)1);
smin = max(d__1,d__2);
sgn = (double) (*isgn);
if (notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
for (k = *m; k >= 1; --k) {
i__2 = *m - k;
/* Computing MIN */
i__3 = k + 1;
/* Computing MIN */
i__4 = k + 1;
z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
min(i__4,*m) + l * c_dim1], &c__1);
suml.r = z__1.r, suml.i = z__1.i;
i__2 = l - 1;
z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = z__1.r, sumr.i = z__1.i;
i__2 = k + l * c_dim1;
z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i;
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i;
vec.r = z__1.r, vec.i = z__1.i;
scaloc = 1.;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i;
z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i;
a11.r = z__1.r, a11.i = z__1.i;
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
d__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.;
da11 = smin;
*info = 1;
}
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
d__2));
if (da11 < 1. && db > 1.) {
if (db > bignum * da11) {
scaloc = 1. / db;
}
}
z__3.r = scaloc, z__3.i = 0.;
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
z__3.i + vec.i * z__3.r;
z__1 = zladiv_(&z__2, &a11);
x11.r = z__1.r, x11.i = z__1.i;
if (scaloc != 1.) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L10: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L20: */
}
/* L30: */
}
} else if (! notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
i__2 = *m;
for (k = 1; k <= i__2; ++k) {
i__3 = k - 1;
z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = z__1.r, suml.i = z__1.i;
i__3 = l - 1;
z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = z__1.r, sumr.i = z__1.i;
i__3 = k + l * c_dim1;
z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i;
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i;
vec.r = z__1.r, vec.i = z__1.i;
scaloc = 1.;
d_cnjg(&z__2, &a[k + k * a_dim1]);
i__3 = l + l * b_dim1;
z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i;
z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
a11.r = z__1.r, a11.i = z__1.i;
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
d__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.;
da11 = smin;
*info = 1;
}
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
d__2));
if (da11 < 1. && db > 1.) {
if (db > bignum * da11) {
scaloc = 1. / db;
}
}
z__3.r = scaloc, z__3.i = 0.;
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
z__3.i + vec.i * z__3.r;
z__1 = zladiv_(&z__2, &a11);
x11.r = z__1.r, x11.i = z__1.i;
if (scaloc != 1.) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L40: */
}
*scale *= scaloc;
}
i__3 = k + l * c_dim1;
c__[i__3].r = x11.r, c__[i__3].i = x11.i;
/* L50: */
}
/* L60: */
}
} else if (! notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
i__1 = *m;
for (k = 1; k <= i__1; ++k) {
i__2 = k - 1;
z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = z__1.r, suml.i = z__1.i;
i__2 = *n - l;
/* Computing MIN */
i__3 = l + 1;
/* Computing MIN */
i__4 = l + 1;
z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
l + min(i__4,*n) * b_dim1], ldb);
sumr.r = z__1.r, sumr.i = z__1.i;
i__2 = k + l * c_dim1;
d_cnjg(&z__4, &sumr);
z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i;
vec.r = z__1.r, vec.i = z__1.i;
scaloc = 1.;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i;
z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i;
d_cnjg(&z__1, &z__2);
a11.r = z__1.r, a11.i = z__1.i;
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
d__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.;
da11 = smin;
*info = 1;
}
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
d__2));
if (da11 < 1. && db > 1.) {
if (db > bignum * da11) {
scaloc = 1. / db;
}
}
z__3.r = scaloc, z__3.i = 0.;
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
z__3.i + vec.i * z__3.r;
z__1 = zladiv_(&z__2, &a11);
x11.r = z__1.r, x11.i = z__1.i;
if (scaloc != 1.) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L70: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L80: */
}
/* L90: */
}
} else if (notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
for (k = *m; k >= 1; --k) {
i__1 = *m - k;
/* Computing MIN */
i__2 = k + 1;
/* Computing MIN */
i__3 = k + 1;
z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
min(i__3,*m) + l * c_dim1], &c__1);
suml.r = z__1.r, suml.i = z__1.i;
i__1 = *n - l;
/* Computing MIN */
i__2 = l + 1;
/* Computing MIN */
i__3 = l + 1;
z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
l + min(i__3,*n) * b_dim1], ldb);
sumr.r = z__1.r, sumr.i = z__1.i;
i__1 = k + l * c_dim1;
d_cnjg(&z__4, &sumr);
z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i;
vec.r = z__1.r, vec.i = z__1.i;
scaloc = 1.;
i__1 = k + k * a_dim1;
d_cnjg(&z__3, &b[l + l * b_dim1]);
z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i;
z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i;
a11.r = z__1.r, a11.i = z__1.i;
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
d__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.;
da11 = smin;
*info = 1;
}
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
d__2));
if (da11 < 1. && db > 1.) {
if (db > bignum * da11) {
scaloc = 1. / db;
}
}
z__3.r = scaloc, z__3.i = 0.;
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
z__3.i + vec.i * z__3.r;
z__1 = zladiv_(&z__2, &a11);
x11.r = z__1.r, x11.i = z__1.i;
if (scaloc != 1.) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L100: */
}
*scale *= scaloc;
}
i__1 = k + l * c_dim1;
c__[i__1].r = x11.r, c__[i__1].i = x11.i;
/* L110: */
}
/* L120: */
}
}
return;
}

107
relapack/src/ztrtri.c Normal file
View File

@@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_ztrtri_rec(const char *, const char *, const int *,
double *, const int *, int *);
/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's ztrtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html
* */
void RELAPACK_ztrtri(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZTRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
*info = i;
return;
}
}
// Recursive kernel
RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** ztrtri's recursive compute kernel */
static void RELAPACK_ztrtri_rec(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) {
// Unblocked
LAPACK(ztrti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}