Add Elmar Peise's ReLAPACK

Martin Kroeker 2017-06-28 17:38:41 +02:00, committed by GitHub
parent 482015f8d6
commit 9b7b5f7fdc
82 changed files with 20579 additions and 0 deletions

relapack/LICENSE (new file, 22 lines)
@@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2016 Elmar Peise
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

relapack/Makefile (new file, 64 lines)
@@ -0,0 +1,64 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
SRC = $(wildcard src/*.c)
OBJS = $(SRC:%.c=%.o)
TEST_SUITS = \
slauum dlauum clauum zlauum \
spotrf dpotrf cpotrf zpotrf \
spbtrf dpbtrf cpbtrf zpbtrf \
ssygst dsygst chegst zhegst \
ssytrf dsytrf csytrf chetrf zsytrf zhetrf \
sgetrf dgetrf cgetrf zgetrf \
sgbtrf dgbtrf cgbtrf zgbtrf \
strsyl dtrsyl ctrsyl ztrsyl \
stgsyl dtgsyl ctgsyl ztgsyl \
sgemmt dgemmt cgemmt zgemmt
TESTS = $(TEST_SUITS:%=test/%.pass) # dummies
TEST_EXES = $(TEST_SUITS:%=test/%.x)
LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm
.SECONDARY: $(TEST_EXES)
.PHONY: test
# ReLAPACK compilation
libs: $(OBJS)
@echo "Building ReLAPACK library $(LIBNAME)"
$(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS)
$(RANLIB) $(TOPDIR)/$(LIBNAME)
%.o: %.c config.h
$(CC) $(CFLAGS) -c $< -o $@
# ReLAPACK testing
test: $(TEST_EXES) $(TESTS)
@echo "passed all tests"
test/%.pass: test/%.x
@echo -n $*:
@./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<)
test/s%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/d%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/c%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
test/z%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
# cleaning up
clean:
rm -f $(OBJS) test/util.o test/*.x

relapack/README.md (new file, 68 lines)
@@ -0,0 +1,68 @@
ReLAPACK
========
[![Build Status](https://travis-ci.org/HPAC/ReLAPACK.svg?branch=master)](https://travis-ci.org/HPAC/ReLAPACK)
[Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK)
ReLAPACK offers a collection of recursive algorithms for many of LAPACK's
compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK
integrates effortlessly into existing application codes. ReLAPACK's routines
not only outperform the reference LAPACK but also improve upon the performance
of tuned implementations, such as OpenBLAS and MKL.
Coverage
--------
For a detailed list of covered operations and an overview of operations to which
recursion is not efficiently applicable, see [coverage.md](coverage.md).
Installation
------------
To compile with the default configuration, simply run `make` to create the
library `librelapack.a`.
### Linking with MKL
Note that to link with MKL, you currently need to set the flag
`COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and
`ztrsyl`. For further configuration options see [config.md](config.md).
### Dependencies
ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked
kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized
and machine specific implementations of these libraries, which are commonly
provided by hardware vendors or available as open source (e.g.,
[OpenBLAS](http://www.openblas.net/)).
Testing
-------
ReLAPACK's test suite compares its routines numerically with LAPACK's
counterparts. To set up the tests (located in `test/`) you need to specify
link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then
`make test` runs the tests. For details on the performed tests, see
[test/README.md](test/README.md).
Examples
--------
Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the
covered routines applies directly to ReLAPACK. A few separate examples are
given in `examples/`. For details, see [examples/README.md](examples/README.md).
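For orientation, here is a minimal sketch of such a use (not one of the shipped
examples; it assumes the `dgetrf` wrapper is enabled and that symbols carry a
trailing underscore, the shipped default in `config.h`):

```c
#include <stdio.h>

/* standard LAPACK dgetrf interface; ReLAPACK provides this symbol when
 * INCLUDE_DGETRF is enabled in config.h */
void dgetrf_(const int *m, const int *n, double *A, const int *ldA,
             int *ipiv, int *info);

int main(void) {
    const int n = 3, ldA = 3;
    /* column-major 3x3 matrix */
    double A[] = { 4, 2, 1,
                   2, 5, 3,
                   1, 3, 6 };
    int ipiv[3], info;

    dgetrf_(&n, &n, A, &ldA, ipiv, &info);  /* P L U = A, factored in place */
    printf("info = %d, ipiv = %d %d %d\n", info, ipiv[0], ipiv[1], ipiv[2]);
    return 0;
}
```

Compile and link against `librelapack.a` plus a BLAS/LAPACK implementation, just
as for any other LAPACK-based program.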
Citing
------
When referencing ReLAPACK, please cite the preprint of the paper
[Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763):
@article{relapack,
author = {Elmar Peise and Paolo Bientinesi},
title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection},
journal = {CoRR},
volume = {abs/1602.06763},
year = {2016},
url = {http://arxiv.org/abs/1602.06763},
}

relapack/config.h (new file, 208 lines)
@@ -0,0 +1,208 @@
#ifndef RELAPACK_CONFIG_H
#define RELAPACK_CONFIG_H
// ReLAPACK configuration file.
// See also config.md
////////////////////////////////
// BLAS/LAPACK object symbols //
////////////////////////////////
// BLAS routines linked against have a trailing underscore
#define BLAS_UNDERSCORE 1
// LAPACK routines linked against have a trailing underscore
#define LAPACK_UNDERSCORE BLAS_UNDERSCORE
// Complex BLAS/LAPACK routines return their result in the first argument
// This option must be enabled when linking to MKL for ctrsyl and ztrsyl to
// work.
#define COMPLEX_FUNCTIONS_AS_ROUTINES 0
#ifdef F_INTERFACE_INTEL
#undef COMPLEX_FUNCTIONS_AS_ROUTINES
#define COMPLEX_FUNCTIONS_AS_ROUTINES 1
#endif
#define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
#define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
// The BLAS-like extension xgemmt is provided by an external library.
#define HAVE_XGEMMT 0
////////////////////////////
// Use malloc in ReLAPACK //
////////////////////////////
#define ALLOW_MALLOC 1
// allow malloc in xsygst for improved performance
#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC
// allow malloc in xsytrf if the passed work buffer is too small
#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
////////////////////////////////
// LAPACK routine replacement //
////////////////////////////////
// The following macros specify which routines are included in the library under
// LAPACK's symbol names: 1 included, 0 not included
#define INCLUDE_ALL 1
#define INCLUDE_XLAUUM INCLUDE_ALL
#define INCLUDE_SLAUUM INCLUDE_XLAUUM
#define INCLUDE_DLAUUM INCLUDE_XLAUUM
#define INCLUDE_CLAUUM INCLUDE_XLAUUM
#define INCLUDE_ZLAUUM INCLUDE_XLAUUM
#define INCLUDE_XSYGST INCLUDE_ALL
#define INCLUDE_SSYGST INCLUDE_XSYGST
#define INCLUDE_DSYGST INCLUDE_XSYGST
#define INCLUDE_CHEGST INCLUDE_XSYGST
#define INCLUDE_ZHEGST INCLUDE_XSYGST
#define INCLUDE_XTRTRI INCLUDE_ALL
#define INCLUDE_STRTRI INCLUDE_XTRTRI
#define INCLUDE_DTRTRI INCLUDE_XTRTRI
#define INCLUDE_CTRTRI INCLUDE_XTRTRI
#define INCLUDE_ZTRTRI INCLUDE_XTRTRI
#define INCLUDE_XPOTRF INCLUDE_ALL
#define INCLUDE_SPOTRF INCLUDE_XPOTRF
#define INCLUDE_DPOTRF INCLUDE_XPOTRF
#define INCLUDE_CPOTRF INCLUDE_XPOTRF
#define INCLUDE_ZPOTRF INCLUDE_XPOTRF
#define INCLUDE_XPBTRF INCLUDE_ALL
#define INCLUDE_SPBTRF INCLUDE_XPBTRF
#define INCLUDE_DPBTRF INCLUDE_XPBTRF
#define INCLUDE_CPBTRF INCLUDE_XPBTRF
#define INCLUDE_ZPBTRF INCLUDE_XPBTRF
#define INCLUDE_XSYTRF INCLUDE_ALL
#define INCLUDE_SSYTRF INCLUDE_XSYTRF
#define INCLUDE_DSYTRF INCLUDE_XSYTRF
#define INCLUDE_CSYTRF INCLUDE_XSYTRF
#define INCLUDE_CHETRF INCLUDE_XSYTRF
#define INCLUDE_ZSYTRF INCLUDE_XSYTRF
#define INCLUDE_ZHETRF INCLUDE_XSYTRF
#define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF
#define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF
#define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF
#define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF
#define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF
#define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF
#define INCLUDE_XGETRF INCLUDE_ALL
#define INCLUDE_SGETRF INCLUDE_XGETRF
#define INCLUDE_DGETRF INCLUDE_XGETRF
#define INCLUDE_CGETRF INCLUDE_XGETRF
#define INCLUDE_ZGETRF INCLUDE_XGETRF
#define INCLUDE_XGBTRF INCLUDE_ALL
#define INCLUDE_SGBTRF INCLUDE_XGBTRF
#define INCLUDE_DGBTRF INCLUDE_XGBTRF
#define INCLUDE_CGBTRF INCLUDE_XGBTRF
#define INCLUDE_ZGBTRF INCLUDE_XGBTRF
#define INCLUDE_XTRSYL INCLUDE_ALL
#define INCLUDE_STRSYL INCLUDE_XTRSYL
#define INCLUDE_DTRSYL INCLUDE_XTRSYL
#define INCLUDE_CTRSYL INCLUDE_XTRSYL
#define INCLUDE_ZTRSYL INCLUDE_XTRSYL
#define INCLUDE_XTGSYL INCLUDE_ALL
#define INCLUDE_STGSYL INCLUDE_XTGSYL
#define INCLUDE_DTGSYL INCLUDE_XTGSYL
#define INCLUDE_CTGSYL INCLUDE_XTGSYL
#define INCLUDE_ZTGSYL INCLUDE_XTGSYL
#define INCLUDE_XGEMMT 0
#define INCLUDE_SGEMMT INCLUDE_XGEMMT
#define INCLUDE_DGEMMT INCLUDE_XGEMMT
#define INCLUDE_CGEMMT INCLUDE_XGEMMT
#define INCLUDE_ZGEMMT INCLUDE_XGEMMT
/////////////////////
// crossover sizes //
/////////////////////
// default crossover size
#define CROSSOVER 24
// individual crossover sizes
#define CROSSOVER_XLAUUM CROSSOVER
#define CROSSOVER_SLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_DLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_CLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM
#define CROSSOVER_XSYGST CROSSOVER
#define CROSSOVER_SSYGST CROSSOVER_XSYGST
#define CROSSOVER_DSYGST CROSSOVER_XSYGST
#define CROSSOVER_CHEGST CROSSOVER_XSYGST
#define CROSSOVER_ZHEGST CROSSOVER_XSYGST
#define CROSSOVER_XTRTRI CROSSOVER
#define CROSSOVER_STRTRI CROSSOVER_XTRTRI
#define CROSSOVER_DTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_CTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI
#define CROSSOVER_XPOTRF CROSSOVER
#define CROSSOVER_SPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_DPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_CPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF
#define CROSSOVER_XPBTRF CROSSOVER
#define CROSSOVER_SPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_DPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_CPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF
#define CROSSOVER_XSYTRF CROSSOVER
#define CROSSOVER_SSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_DSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_CSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_CHETRF CROSSOVER_XSYTRF
#define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF
#define CROSSOVER_ZHETRF CROSSOVER_XSYTRF
#define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF
#define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF
#define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF
#define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF
#define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF
#define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF
#define CROSSOVER_XGETRF CROSSOVER
#define CROSSOVER_SGETRF CROSSOVER_XGETRF
#define CROSSOVER_DGETRF CROSSOVER_XGETRF
#define CROSSOVER_CGETRF CROSSOVER_XGETRF
#define CROSSOVER_ZGETRF CROSSOVER_XGETRF
#define CROSSOVER_XGBTRF CROSSOVER
#define CROSSOVER_SGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_DGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_CGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF
#define CROSSOVER_XTRSYL CROSSOVER
#define CROSSOVER_STRSYL CROSSOVER_XTRSYL
#define CROSSOVER_DTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_CTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL
#define CROSSOVER_XTGSYL CROSSOVER
#define CROSSOVER_STGSYL CROSSOVER_XTGSYL
#define CROSSOVER_DTGSYL CROSSOVER_XTGSYL
#define CROSSOVER_CTGSYL CROSSOVER_XTGSYL
#define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL
// sytrf helper routine
#define CROSSOVER_XGEMMT CROSSOVER_XSYTRF
#define CROSSOVER_SGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_DGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_CGEMMT CROSSOVER_XGEMMT
#define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT
#endif /* RELAPACK_CONFIG_H */

relapack/config.md (new file, 87 lines)
@@ -0,0 +1,87 @@
ReLAPACK Configuration
======================
ReLAPACK has two configuration files: `make.inc`, which is included by the
Makefile, and `config.h`, which is included by the source files.
Build and Testing Environment
-----------------------------
The build environment (compiler and flags) and the test configuration (linker
flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size
and error bounds are defined in `test/config.h`.
The library `librelapack.a` is compiled by invoking `make`. The tests are
performed by either `make test` or calling `make` in the test folder.
BLAS/LAPACK complex function interfaces
---------------------------------------
For BLAS and LAPACK functions that return a complex number, there exist two
conflicting (FORTRAN compiler dependent) calling conventions: either the result
is returned as a `struct` of two floating point numbers or an additional first
argument with a pointer to such a `struct` is used. By default ReLAPACK uses
the former (which is what gfortran uses), but it can switch to the latter by
setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK
specific counterparts) to `1` in `config.h`.
**For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.**
(Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases
will segfault or return errors on the order of 1 or larger.)
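As an illustration (a sketch only; `cdotc` merely exemplifies a complex-valued
BLAS function, and the struct name below is ours), the two conventions
correspond to the following prototypes:

```c
/* Sketch of the two conflicting conventions for a complex-valued BLAS
 * function such as cdotc; only one of them matches the linked library. */
typedef struct { float real, imag; } complex_result;

#if COMPLEX_FUNCTIONS_AS_ROUTINES
/* result passed back through an additional first argument (e.g., MKL) */
void cdotc_(complex_result *result, const int *n, const float *x,
            const int *incx, const float *y, const int *incy);
#else
/* result returned by value (gfortran's convention) */
complex_result cdotc_(const int *n, const float *x, const int *incx,
                      const float *y, const int *incy);
#endif
```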
BLAS extension `xgemmt`
-----------------------
The LDL decompositions require `xgemmt`, a general matrix-matrix product that
updates only a triangular part of the result matrix. If the BLAS implementation
linked against provides such a routine, set the flag `HAVE_XGEMMT` to `1` in
`config.h`; otherwise, ReLAPACK uses its own recursive implementation of this
kernel. `xgemmt` is provided by MKL.
Routine Selection
-----------------
ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the
corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to
`1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g.,
`dgetrf_`). By default, wrappers for all routines are enabled.
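For example (an illustrative edit of `config.h`, not the shipped defaults), a
build that exposes only `dgetrf_` under its LAPACK name would change the
corresponding lines to:

```c
#define INCLUDE_ALL    0               /* disable all LAPACK-name wrappers */
#define INCLUDE_XGETRF INCLUDE_ALL
#define INCLUDE_DGETRF 1               /* but still provide dgetrf_ */
```

The `RELAPACK_*` entry points themselves are always built; the flags only
control the wrappers under LAPACK's symbol names.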
Crossover Size
--------------
The crossover size determines below which matrix sizes ReLAPACK's recursive
algorithms switch to LAPACK's unblocked routines, thereby avoiding BLAS level 3
calls on tiny blocks. The crossover size is set in `config.h` and can be chosen
either globally for the entire library, per operation, or individually per
routine.
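For instance (the numbers are purely illustrative), the switch point can be
raised for one operation or a single routine by editing the corresponding lines
in `config.h`:

```c
#define CROSSOVER        24            /* global default (as shipped) */
#define CROSSOVER_XPOTRF 32            /* all Cholesky (xpotrf) routines */
#define CROSSOVER_DPOTRF 64            /* dpotrf only */
```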
Allowing Temporary Buffers
--------------------------
Two of ReLAPACK's routines make use of temporary buffers, which are allocated
and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine
specific counterparts) to 0 in `config.h` will disable these buffers. The
affected routines are:
* `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in
  LAPACK, this size can be queried by setting `lWork = -1`; the buffer passed by
  the caller is used if it is large enough, and a local buffer is allocated only
  if it is not (see the sketch after this list).
  The advantage of this mechanism is that ReLAPACK works seamlessly even with
  codes that statically provide too little memory, instead of breaking them.
* `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem
  to standard form can use an auxiliary buffer of size n^2 / 2 to avoid
  redundant computations. It thereby performs about 30% fewer FLOPs than
  LAPACK.
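A minimal sketch of the `lWork = -1` query for the double-precision LDL
routine (assuming the LAPACK-named wrapper `dsytrf_` is enabled and symbols
carry a trailing underscore):

```c
#include <stdio.h>
#include <stdlib.h>

/* standard LAPACK dsytrf interface, as provided by ReLAPACK's wrapper */
void dsytrf_(const char *uplo, const int *n, double *A, const int *ldA,
             int *ipiv, double *Work, const int *lWork, int *info);

int main(void) {
    const int n = 1000, ldA = 1000;
    double *A = calloc((size_t)ldA * n, sizeof(double));
    int *ipiv = malloc(n * sizeof(int));
    int i, info, lWork = -1;
    double query;

    for (i = 0; i < n; i++)
        A[i + (size_t)ldA * i] = 1.0;   /* identity: trivially factorizable */

    /* workspace query: the required size is returned in query */
    dsytrf_("L", &n, A, &ldA, ipiv, &query, &lWork, &info);

    lWork = (int)query;
    double *Work = malloc((size_t)lWork * sizeof(double));
    dsytrf_("L", &n, A, &ldA, ipiv, Work, &lWork, &info);  /* L D L^T = A */
    printf("info = %d\n", info);

    free(Work); free(ipiv); free(A);
    return 0;
}
```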
FORTRAN symbol names
--------------------
ReLAPACK is commonly linked to BLAS and LAPACK libraries with standard FORTRAN
interfaces. Since these libraries usually append an underscore to their symbol
names, ReLAPACK provides configuration switches in `config.h` to adjust the
routine names it uses accordingly.
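For example (an illustrative edit), when the linked BLAS and LAPACK symbols
carry no trailing underscore, the corresponding switches in `config.h` become:

```c
#define BLAS_UNDERSCORE   0                 /* call dgemm instead of dgemm_ */
#define LAPACK_UNDERSCORE BLAS_UNDERSCORE   /* LAPACK follows the BLAS setting */
```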

relapack/coverage.md (new file, 212 lines)
@@ -0,0 +1,212 @@
Coverage of ReLAPACK
====================
This file lists all LAPACK compute routines that are covered by recursive
algorithms in ReLAPACK; it also lists all of LAPACK's blocked algorithms that
are not (yet) part of ReLAPACK.
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
- [List of covered LAPACK routines](#list-of-covered-lapack-routines)
- [`xlauum`](#xlauum)
- [`xsygst`](#xsygst)
- [`xtrtri`](#xtrtri)
- [`xpotrf`](#xpotrf)
- [`xpbtrf`](#xpbtrf)
- [`xsytrf`](#xsytrf)
- [`xgetrf`](#xgetrf)
- [`xgbtrf`](#xgbtrf)
- [`xtrsyl`](#xtrsyl)
- [`xtgsyl`](#xtgsyl)
- [Covered BLAS extension](#covered-blas-extension)
- [`xgemmt`](#xgemmt)
- [Not covered yet](#not-covered-yet)
- [`xpstrf`](#xpstrf)
- [Not covered: extra FLOPs](#not-covered-extra-flops)
- [QR decomposition (and related)](#qr-decomposition-and-related)
- [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal)
- [Reduction to bidiagonal](#reduction-to-bidiagonal)
- [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
List of covered LAPACK routines
-------------------------------
### `xlauum`
Multiplication of a triangular matrix with its (complex conjugate) transpose,
resulting in a symmetric (Hermitian) matrix.
Routines: `slauum`, `dlauum`, `clauum`, `zlauum`
Operations:
* A = L^T L
* A = U U^T
### `xsygst`
Simultaneous two-sided multiplication of a symmetric matrix with a triangular
matrix and its transpose
Routines: `ssygst`, `dsygst`, `chegst`, `zhegst`
Operations:
* A = inv(L) A inv(L^T)
* A = inv(U^T) A inv(U)
* A = L^T A L
* A = U A U^T
### `xtrtri`
Inversion of a triangular matrix
Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri`
Operations:
* L = inv(L)
* U = inv(U)
### `xpotrf`
Cholesky decomposition of a symmetric (Hermitian) positive definite matrix
Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf`
Operations:
* L L^T = A
* U^T U = A
### `xpbtrf`
Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix
Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf`
Operations:
* L L^T = A
* U^T U = A
### `xsytrf`
LDL decomposition of a symmetric (or Hermitian) matrix
Routines:
* `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`,
* `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`,
`zhetrf_rook`
Operations:
* L D L^T = A
* U D U^T = A
### `xgetrf`
LU decomposition of a general matrix with pivoting
Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf`
Operation: P L U = A
### `xgbtrf`
LU decomposition of a general banded matrix with pivoting
Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf`
Operation: L U = A
### `xtrsyl`
Solution of the quasi-triangular Sylvester equation
Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl`
Operations:
* A X + X B = C -> X
* A^T X + X B = C -> X
* A X + X B^T = C -> X
* A^T X + X B^T = C -> X
* A X - X B = C -> X
* A^T X - X B = C -> X
* A X - X B^T = C -> X
* A^T X - X B^T = C -> X
### `xtgsyl`
Solution of the generalized Sylvester equations
Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl`
Operations:
* A R - L B = C, D R - L E = F -> L, R
* A^T R + D^T L = C, R B^T + L E^T = -F -> L, R
Covered BLAS extension
----------------------
### `xgemmt`
Matrix-matrix product updating only a triangular part of the result
Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt`
Operations:
* C = alpha A B + beta C
* C = alpha A B^T + beta C
* C = alpha A^T B + beta C
* C = alpha A^T B^T + beta C
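For illustration only (not part of the coverage list), a call of the
double-precision variant through ReLAPACK's own entry point, whose prototype is
declared in `inc/relapack.h`, might look like:

```c
#include "relapack.h"

/* update only the lower triangle: C(lower) = alpha * A * B^T + beta * C */
void lower_update(int n, int k, const double *A, int ldA,
                  const double *B, int ldB, double *C, int ldC) {
    const double alpha = 1.0, beta = 1.0;
    RELAPACK_dgemmt("L", "N", "T", &n, &k, &alpha, A, &ldA, B, &ldB,
                    &beta, C, &ldC);
}
```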
Not covered yet
---------------
The following operation is implemented as a blocked algorithm in LAPACK but
currently not yet covered in ReLAPACK as a recursive algorithm
### `xpstrf`
Cholesky decomposition of a positive semi-definite matrix with complete pivoting.
Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf`
Operations:
* P L L^T P^T = A
* P U^T U P^T = A
Not covered: extra FLOPs
------------------------
The following routines are not covered because recursive variants would require
considerably more FLOPs or operate on banded matrices.
### QR decomposition (and related)
Routines:
* `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf`
* `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf`
* `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf`
* `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf`
* `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf`
Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A
Routines for multiplication with Q:
* `sormqr`, `dormqr`, `cunmqr`, `zunmqr`
* `sormrq`, `dormrq`, `cunmrq`, `zunmrq`
* `sormql`, `dormql`, `cunmql`, `zunmql`
* `sormlq`, `dormlq`, `cunmlq`, `zunmlq`
* `sormrz`, `dormrz`, `cunmrz`, `zunmrz`
Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T
Routines for construction of Q:
* `sorgqr`, `dorgqr`, `cungqr`, `zungqr`
* `sorgrq`, `dorgrq`, `cungrq`, `zungrq`
* `sorgql`, `dorgql`, `cungql`, `zungql`
* `sorglq`, `dorglq`, `cunglq`, `zunglq`
### Symmetric reduction to tridiagonal
Routines: `ssytrd`, `dsytrd`, `chetrd`, `zhetrd`
Operation: Q T Q^T = A
### Reduction to bidiagonal
Routines: `sgebrd`, `dgebrd`, `cgebrd`, `zgebrd`
Operation: Q B P^T = A
### Reduction to upper Hessenberg
Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd`
Operation: Q H Q^T = A

relapack/inc/relapack.h (new file, 67 lines)
@@ -0,0 +1,67 @@
#ifndef RELAPACK_H
#define RELAPACK_H
void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);
void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);
void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *);
void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
#endif /* RELAPACK_H */

relapack/src/blas.h (new file, 61 lines)
@@ -0,0 +1,61 @@
#ifndef BLAS_H
#define BLAS_H
extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *);
extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *);
extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *);
extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *);
extern void BLAS(sscal)(const int *, const float *, float *, const int *);
extern void BLAS(dscal)(const int *, const double *, double *, const int *);
extern void BLAS(cscal)(const int *, const float *, float *, const int *);
extern void BLAS(zscal)(const int *, const double *, double *, const int *);
extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
#if HAVE_XGEMMT
extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
#endif
#endif /* BLAS_H */

relapack/src/cgbtrf.c (new file, 230 lines)
@@ -0,0 +1,230 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *,
const int *, float *, const int *, int *, float *, const int *, float *,
const int *, int *);
/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's cgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html
* */
void RELAPACK_cgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CGBTRF", &minfo);
return;
}
// Constant
const float ZERO[] = { 0., 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * kv;
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
float *const A_j = A + 2 * *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[2 * i] = A_j[2 * i + 1] = 0.;
}
// Allocate work space
const int n1 = CREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float));
float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float));
LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** cgbtrf's recursive compute kernel */
static void RELAPACK_cgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_CGBTRF, 1)) {
// Unblocked
LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * kv;
// Splitting
const int n1 = MIN(CREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
float *const Ab_L = Ab;
float *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * m1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
float *const A_Rl = A_R;
float *const A_Rr = A_R + 2 * *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + 2 * *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + 2 * m21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + 2 * *ldA * n21;
float *const A_BRbl = A_BR + 2 * m21;
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
float *const A_Rrj = A_Rr + 2 * *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const float tmpr = A_Rrj[2 * i];
const float tmpc = A_Rrj[2 * i + 1];
A_Rrj[2 * i] = A_Rrj[2 * ip];
A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
A_Rrj[2 * ip] = tmpr;
A_Rrj[2 * ip + 1] = tmpc;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}

relapack/src/cgemmt.c (new file, 167 lines)
@@ -0,0 +1,167 @@
#include "relapack.h"
static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
/** CGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_cgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int ctransA = LAPACK(lsame)(transA, "C");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
const int ctransB = LAPACK(lsame)(transB, "C");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !ctransA && !notransA)
info = 2;
else if (!tranB && !ctransB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("CGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');
// Recursive kernel
RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** cgemmt's recursive compute kernel */
static void RELAPACK_cgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_CGEMMT, 1)) {
// Unblocked
RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const float *const A_T = A;
const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const float *const B_L = B;
const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
float *const C_TL = C;
float *const C_TR = C + 2 * *ldC * n1;
float *const C_BL = C + 2 * n1;
float *const C_BR = C + 2 * *ldC * n1 + 2 * n1;
// recursion(C_TL)
RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** cgemmt's unblocked compute kernel */
static void RELAPACK_cgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const float *const A_0 = A;
const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
float *const C_0i = C + 2 * *ldC * i;
float *const C_ii = C + 2 * *ldC * i + 2 * i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}

relapack/src/cgetrf.c (new file, 117 lines)
@@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
const int *, int *, int *);
/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's cgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
*/
void RELAPACK_cgetrf(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const float ONE[] = { 1., 0. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const float *const A_L = A;
float *const A_R = A + 2 * *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** cgetrf's recursive compute kernel */
static void RELAPACK_cgetrf_rec(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
// Unblocked
LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}

relapack/src/chegst.c (new file, 212 lines)
@@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_chegst_rec(const int *, const char *, const int *,
float *, const int *, const float *, const int *,
float *, const int *, int *);
/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's chegst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html
* */
void RELAPACK_chegst(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHEGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
float *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = CREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * 2 * sizeof(float));
if (!Work)
lWork = 0;
#endif
// recursive kernel
RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** chegst's recursive compute kernel */
static void RELAPACK_chegst_rec(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
float *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_CHEGST, 1)) {
// Unblocked
LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const float ZERO[] = { 0., 0. };
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const float HALF[] = { .5, 0. };
const float MHALF[] = { -.5, 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// B_TL B_TR
// B_BL B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BL = B + 2 * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// recursion(A_TL, B_TL)
RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR' * A_BL
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR'
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}

relapack/src/chetrf.c (new file, 236 lines)
@@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's chetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
* */
void RELAPACK_chetrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
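// Illustrative call sequence (a sketch; A, ldA, n and ipiv are placeholders
// for the caller's data: A an n x n Hermitian matrix in column-major order
// with leading dimension ldA, ipiv an int array of length n):
//   int lWork = -1, info;
//   float query[2];
//   RELAPACK_chetrf("L", &n, A, &ldA, ipiv, query, &lWork, &info); // workspace query
//   lWork = (int) query[0];
//   float *Work = malloc(2 * lWork * sizeof(float));               // lWork complex entries
//   RELAPACK_chetrf("L", &n, A, &ldA, ipiv, Work, &lWork, &info);  // factorization
//   free(Work);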
/** chetrf's recursive compute kernel */
static void RELAPACK_chetrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
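// (the columns of Work used for A_L's panel are no longer needed once the
//  updates below have consumed them, so at the top level the A_BR recursion
//  can reuse Work from the start with the tighter leading dimension n2)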
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

520
relapack/src/chetrf_rec2.c Normal file
View File

@ -0,0 +1,520 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
*
* This routine is a minor modification of LAPACK's clahef.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int *
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2, r__3, r__4;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k;
static float t, r1;
static complex d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen), ccopy_(int *, complex *, int *,
complex *, int *), cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float absakk;
extern /* Subroutine */ int clacgv_(int *, complex *, int *);
extern int icamax_(int *, complex *, int *);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
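/* alpha is the standard Bunch-Kaufman pivoting threshold, (1 + sqrt(17)) / 8 */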
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
i__1 = k - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - 1;
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (kw - 1) * w_dim1]), dabs(r__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
r1 = 1.f / a[i__1].r;
i__1 = k - 1;
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + kw * w_dim1], &q__2);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
q__2.r = t, q__2.i = 0.f;
c_div(&q__1, &q__2, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
r_cnjg(&q__2, &d21);
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
q__2.r * q__3.i + q__2.i * q__3.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (imax < *n) {
i__1 = *n - imax;
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (k + 1) * w_dim1]), dabs(r__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
r1 = 1.f / a[i__1].r;
i__1 = *n - k;
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + k * w_dim1], &q__2);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
q__2.r = t, q__2.i = 0.f;
c_div(&q__1, &q__2, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
r_cnjg(&q__2, &d21);
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
q__2.r * q__3.i + q__2.i * q__3.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/chetrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's chetrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html
* */
void RELAPACK_chetrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** chetrf_rook's recursive compute kernel */
static void RELAPACK_chetrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

661
relapack/src/chetrf_rook_rec2.c Normal file
View File

@ -0,0 +1,661 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
*
* This routine is a minor modification of LAPACK's clahef_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, complex *a, int *lda, int *ipiv,
complex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4, q__5;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k, p;
static float t, r1;
static complex d11, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen);
static float sfmin;
extern /* Subroutine */ int ccopy_(int *, complex *, int *,
complex *, int *);
static int itemp;
extern /* Subroutine */ int cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float stemp, absakk;
extern /* Subroutine */ int clacgv_(int *, complex *, int *);
extern int icamax_(int *, complex *, int *);
extern double slamch_(char *, ftnlen);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
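/* sfmin (safe minimum) guards the scaling by a reciprocal pivot below:
   pivots smaller in magnitude are handled by explicit division instead */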
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
if (k > 1) {
i__1 = k - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
c__1);
}
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
r__1 = w[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
if (k > 1) {
i__1 = k - 1;
ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
&c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
if (imax > 1) {
i__1 = imax - 1;
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
}
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = k - 1 - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
i__1 = k - 1 - p;
clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
if (p > 1) {
i__1 = p - 1;
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (dabs(t) >= sfmin) {
r1 = 1.f / t;
i__1 = k - 1;
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L14: */
}
}
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + kw * w_dim1], &q__2);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
r_cnjg(&q__5, &d21);
c_div(&q__2, &q__3, &q__5);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
++jj;
if (kstep == 2 && jp1 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
r__1 = w[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
if (k < *n) {
i__1 = *n - k;
ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
a_dim1], &c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
r__1 = a[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
if (imax < *n) {
i__1 = *n - imax;
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
r__1 = w[i__2].r;
w[i__1].r = r__1, w[i__1].i = 0.f;
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (k + 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (k + 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = p - k - 1;
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
a_dim1], lda);
i__1 = p - k - 1;
clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
if (p < *n) {
i__1 = *n - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
* a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
}
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
r__1 = a[i__2].r;
a[i__1].r = r__1, a[i__1].i = 0.f;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (dabs(t) >= sfmin) {
r1 = 1.f / t;
i__1 = *n - k;
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L74: */
}
}
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
r_cnjg(&q__2, &d21);
c_div(&q__1, &w[k + k * w_dim1], &q__2);
d22.r = q__1.r, d22.i = q__1.i;
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1.f / (q__1.r - 1.f);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
r_cnjg(&q__5, &d21);
c_div(&q__2, &q__3, &q__5);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t * q__2.r, q__1.i = t * q__2.i;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
--jj;
if (kstep == 2 && jp1 != jj && j >= 1) {
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

87
relapack/src/clauum.c Normal file
View File

@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_clauum_rec(const char *, const int *, float *,
const int *, int *);
/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's clauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html
* */
void RELAPACK_clauum(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info);
}
/** clauum's recursive compute kernel */
static void RELAPACK_clauum_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_CLAUUM, 1)) {
// Unblocked
LAPACK(clauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info);
}

157
relapack/src/cpbtrf.c Normal file
View File

@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *,
float *, const int *, float *, const int *, int *);
/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's cpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html
* */
void RELAPACK_cpbtrf(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const float ZERO[] = { 0., 0. };
// Allocate work space
const int n1 = CREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
float *Work = malloc(mWork * nWork * 2 * sizeof(float));
LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
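// Work is a zero-initialized buffer for the triangular corner block of the
// band whose strict triangle lies outside the banded storage; the recursive
// kernel copies that block in, updates it, and copies it back.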
// Recursive kernel
RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** cpbtrf's recursive compute kernel */
static void RELAPACK_cpbtrf_rec(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
float *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_CPBTRF, 1)) {
// Unblocked
LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
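// (with leading dimension ldAb - 1, the skewed band storage can be addressed
//  like an ordinary full-storage matrix as long as only entries within the
//  band are touched)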
// Splitting
const int n1 = MIN(CREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
float *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, *kd);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + 2 * *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + 2 * n21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + 2 * *ldA * n21;
float *const A_BRbl = A_BR + 2 * n21;
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
if (*uplo == 'L') {
// A_BLt = A_BLt / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/cpotrf.c Normal file
View File

@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_cpotrf_rec(const char *, const int *, float *,
const int *, int *);
/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's cpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html
* */
void RELAPACK_cpotrf(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** cpotrf's recursive compute kernel */
static void RELAPACK_cpotrf_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_CPOTRF, 1)) {
// Unblocked
LAPACK(cpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
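// Blocked Cholesky step (lower case): with A_TL = L_TL * L_TL' in place,
// compute A_BL := A_BL / L_TL' and A_BR := A_BR - A_BL * A_BL', then factor
// A_BR recursively; the upper case is the conjugate-transposed analogue.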
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

238
relapack/src/csytrf.c Normal file
View File

@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's csytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html
* */
void RELAPACK_csytrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** csytrf's recursive compute kernel */
static void RELAPACK_csytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

451
relapack/src/csytrf_rec2.c Normal file
View File

@ -0,0 +1,451 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's clasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2, r__3, r__4;
complex q__1, q__2, q__3;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k;
static complex t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern /* Subroutine */ int cscal_(int *, complex *, complex *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen), ccopy_(int *, complex *, int *,
complex *, int *), cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float absakk;
extern int icamax_(int *, complex *, int *);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
w_dim1]), dabs(r__2));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (kw - 1) * w_dim1]), dabs(r__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha *
rowmax) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kk - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = k - 1;
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + kw * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
c_div(&q__1, &t, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
w_dim1]), dabs(r__2));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
jmax + (k + 1) * w_dim1]), dabs(r__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
r__2));
rowmax = dmax(r__3,r__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha *
rowmax) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - kk - 1;
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = *n - k;
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k + k * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
c_div(&q__1, &t, &d21);
d21.r = q__1.r, d21.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
.i;
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
d21.r * q__2.i + d21.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
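The kernel above keeps clasyf's pivot selection: the constant alpha = (1 + sqrt(17))/8 ≈ 0.64 decides between a 1x1 diagonal pivot and an off-diagonal (possibly 2x2) pivot. A standalone sketch of just that test, with made-up magnitudes:

#include <math.h>
#include <stdio.h>

/* Bunch-Kaufman pivot test as used in RELAPACK_csytrf_rec2: keep a 1x1
 * diagonal pivot when |a_kk| >= alpha * colmax, where colmax is the largest
 * off-diagonal magnitude in the current column. */
int main(void) {
    const double alpha = (sqrt(17.) + 1.) / 8.;   /* ~0.6404 */
    const double absakk = 0.5, colmax = 0.7;      /* example magnitudes */
    if (absakk >= alpha * colmax)
        printf("use the 1x1 diagonal pivot\n");
    else
        printf("examine row imax for a 1x1 or 2x2 pivot\n");
    return 0;
}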

236
relapack/src/csytrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's csytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html
* */
void RELAPACK_csytrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** csytrf_rook's recursive compute kernel */
static void RELAPACK_csytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = CREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
float *const A_BL_B = A + 2 * *n;
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + 2 * n1;
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = CREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + 2 * *ldA * n_rest;
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
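Like the other front ends, RELAPACK_csytrf_rook supports the LAPACK workspace-query convention: calling it with lWork = -1 returns the required workspace size (in complex elements) in Work[0]. A hedged calling sketch, assuming the routine is linked from the ReLAPACK/OpenBLAS build; the prototype is repeated from the definition above so the sketch is self-contained:

#include <stdio.h>
#include <stdlib.h>

void RELAPACK_csytrf_rook(const char *uplo, const int *n,
                          float *A, const int *ldA, int *ipiv,
                          float *Work, const int *lWork, int *info);

int main(void) {
    const int n = 4, ldA = 4;
    float *A = calloc(2 * ldA * n, sizeof(float));   /* complex as float pairs */
    int ipiv[4], info, i;
    for (i = 0; i < n; i++)
        A[2 * (i + ldA * i)] = (float)(i + 2);       /* nonzero real diagonal */
    /* workspace query: lWork = -1 returns the size in Work[0] */
    float query;
    int lWork = -1;
    RELAPACK_csytrf_rook("L", &n, A, &ldA, ipiv, &query, &lWork, &info);
    lWork = (int)query;
    float *Work = malloc(2 * lWork * sizeof(float)); /* complex elements */
    RELAPACK_csytrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
    printf("csytrf_rook info = %d\n", info);
    free(Work);
    free(A);
    return 0;
}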

565
relapack/src/csytrf_rook_rec2.c Normal file
View File

@ -0,0 +1,565 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static complex c_b1 = {1.f,0.f};
static int c__1 = 1;
/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's clasyf_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, complex *a, int *lda, int *ipiv,
complex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
double sqrt(double), r_imag(complex *);
void c_div(complex *, complex *, complex *);
/* Local variables */
static int j, k, p;
static complex t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern /* Subroutine */ int cscal_(int *, complex *, complex *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
, complex *, int *, complex *, int *, complex *, complex *
, int *, ftnlen);
static float sfmin;
extern /* Subroutine */ int ccopy_(int *, complex *, int *,
complex *, int *);
static int itemp;
extern /* Subroutine */ int cswap_(int *, complex *, int *,
complex *, int *);
static int kstep;
static float stemp, absakk;
extern int icamax_(int *, complex *, int *);
extern double slamch_(char *, ftnlen);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
i__1 = k + kw * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
w_dim1]), dabs(r__2));
if (k > 1) {
i__1 = k - 1;
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ kw * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha *
rowmax)) {
kp = imax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = k - 1 - kp;
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ k * a_dim1]), dabs(r__2)) >= sfmin) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = k - 1;
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L14: */
}
}
}
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d12.r = w[i__1].r, d12.i = w[i__1].i;
c_div(&q__1, &w[k + kw * w_dim1], &d12);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d12);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d12);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
i__1 = k + k * w_dim1;
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
w_dim1]), dabs(r__2));
if (k < *n) {
i__1 = *n - k;
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ k * w_dim1]), dabs(r__2));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
q__1.r = -1.f, q__1.i = -0.f;
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[jmax + (k + 1) * w_dim1]), dabs(r__2));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
w[itemp + (k + 1) * w_dim1]), dabs(r__2));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha *
rowmax)) {
kp = imax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - k - 1;
ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ k * a_dim1]), dabs(r__2)) >= sfmin) {
c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
r1.r = q__1.r, r1.i = q__1.i;
i__1 = *n - k;
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L74: */
}
}
}
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = q__1.r, d11.i = q__1.i;
c_div(&q__1, &w[k + k * w_dim1], &d21);
d22.r = q__1.r, d22.i = q__1.i;
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
d22.i + d11.i * d22.r;
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
c_div(&q__1, &c_b1, &q__2);
t.r = q__1.r, t.i = q__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
.i;
c_div(&q__2, &q__3, &d21);
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
q__2.i + t.i * q__2.r;
a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
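The pivot vector written by this rook kernel follows the LAPACK convention: a positive ipiv[k] records a 1x1 pivot (rows/columns k and ipiv[k] interchanged), while a 2x2 pivot block is stored as a pair of negative entries (here -p and -kp). A small decoding sketch with hypothetical pivot values, independent of the library:

#include <stdio.h>

/* Walk a 1-based pivot vector as produced by the Bunch-Kaufman kernels:
 * positive entries are 1x1 pivots, a negative pair encodes one 2x2 block. */
int main(void) {
    const int n = 5;
    const int ipiv[5] = { 1, 2, -5, -4, 5 };   /* hypothetical output */
    int k = 0;
    while (k < n) {
        if (ipiv[k] > 0) {
            printf("column %d: 1x1 pivot, interchanged with row %d\n",
                   k + 1, ipiv[k]);
            k += 1;
        } else {
            printf("columns %d:%d: 2x2 pivot, rows %d and %d involved\n",
                   k + 1, k + 2, -ipiv[k], -ipiv[k + 1]);
            k += 2;
        }
    }
    return 0;
}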

268
relapack/src/ctgsyl.c Normal file
View File

@ -0,0 +1,268 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *,
const int *, const float *, const int *, const float *, const int *,
float *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, float *, float *, int *);
/** CTGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's ctgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
* */
void RELAPACK_ctgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "C");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'C';
// Constant
const float ZERO[] = { 0., 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
float scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
float dscale = 0;
float dsum = 1;
RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** ctgsyl's recursive compute kernel */
static void RELAPACK_ctgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dsum, float *dscale,
int *info
) {
if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
// Unblocked
LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1., 0. };
float scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = CREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + 2 * *ldA * m1;
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + 2 * m1;
// D_TL D_TR
// 0 D_BR
const float *const D_TL = D;
const float *const D_TR = D + 2 * *ldD * m1;
const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1;
// F_T
// F_B
float *const F_T = F;
float *const F_B = F + 2 * m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
// C_T = C_T - A_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + 2 * *ldC * n1;
// E_TL E_TR
// 0 E_BR
const float *const E_TL = E;
const float *const E_TR = E + 2 * *ldE * n1;
const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1;
// F_L F_R
float *const F_L = F;
float *const F_R = F + 2 * *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
// C_R = C_R + F_L * B_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
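RELAPACK_ctgsyl keeps ctgsyl's interface, solving the generalized Sylvester pair A*R - L*B = scale*C, D*R - L*E = scale*F (as documented for LAPACK's ctgsyl), with (R, L) overwriting (C, F); ijob selects the optional Dif estimate and lWork = -1 performs a workspace query. A hedged calling sketch on tiny triangular pencils, with the prototype repeated from the definition above so it is self-contained:

#include <stdio.h>
#include <stdlib.h>

void RELAPACK_ctgsyl(const char *trans, const int *ijob, const int *m, const int *n,
                     const float *A, const int *ldA, const float *B, const int *ldB,
                     float *C, const int *ldC,
                     const float *D, const int *ldD, const float *E, const int *ldE,
                     float *F, const int *ldF,
                     float *scale, float *dif,
                     float *Work, const int *lWork, int *iWork, int *info);

int main(void) {
    const int m = 2, n = 2, ijob = 0, ld = 2;
    /* (A,D) and (B,E) upper triangular, complex entries as float pairs */
    float A[8] = { 1,0, 0,0, 0,0, 2,0 }, D[8] = { 1,0, 0,0, 0,0, 1,0 };
    float B[8] = { 3,0, 0,0, 0,0, 4,0 }, E[8] = { 1,0, 0,0, 0,0, 1,0 };
    float C[8] = { 1,0, 1,0, 1,0, 1,0 }, F[8] = { 1,0, 1,0, 1,0, 1,0 };
    float scale, dif, query;
    int iWork[8], info, lWork = -1;
    /* workspace query, then the actual solve (ijob = 0: no Dif estimate) */
    RELAPACK_ctgsyl("N", &ijob, &m, &n, A, &ld, B, &ld, C, &ld,
                    D, &ld, E, &ld, F, &ld, &scale, &dif,
                    &query, &lWork, iWork, &info);
    lWork = (int)query;
    float *Work = malloc(2 * (lWork > 0 ? lWork : 1) * sizeof(float));
    RELAPACK_ctgsyl("N", &ijob, &m, &n, A, &ld, B, &ld, C, &ld,
                    D, &ld, E, &ld, F, &ld, &scale, &dif,
                    Work, &lWork, iWork, &info);
    printf("ctgsyl scale = %g, info = %d\n", scale, info);
    free(Work);
    return 0;
}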

163
relapack/src/ctrsyl.c Normal file
View File

@ -0,0 +1,163 @@
#include "relapack.h"
static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *,
const int *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, int *);
/** CTRSYL solves the complex Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's ctrsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html
* */
void RELAPACK_ctrsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!ctransA && !notransA)
*info = -1;
else if (!ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : 'C';
const char cleantranB = notransB ? 'N' : 'C';
// Recursive kernel
RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** ctrsyl's recursive compute kernel */
static void RELAPACK_ctrsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) {
// Unblocked
RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
const float MSGN[] = { -*isgn, 0. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1., 0. };
float scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = CREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + 2 * *ldA * m1;
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + 2 * m1;
if (*tranA == 'N') {
// recursion(A_BR, B, C_B)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recursion(A_TL, B, C_T)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recursion(A_TL, B, C_T)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recursion(A_BR, B, C_B)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
}
} else {
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + 2 * *ldB * n1;
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + 2 * *ldC * n1;
if (*tranB == 'N') {
// recursion(A, B_TL, C_L)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recursion(A, B_BR, C_R)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recursion(A, B_BR, C_R)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recursion(A, B_TL, C_L)
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
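RELAPACK_ctrsyl keeps ctrsyl's interface: it solves op(A)*X + isgn*X*op(B) = scale*C for X (overwriting C), with A and B upper triangular. A hedged calling sketch, with the prototype repeated from the definition above so it is self-contained:

#include <stdio.h>

void RELAPACK_ctrsyl(const char *tranA, const char *tranB, const int *isgn,
                     const int *m, const int *n,
                     const float *A, const int *ldA, const float *B, const int *ldB,
                     float *C, const int *ldC, float *scale, int *info);

int main(void) {
    const int m = 2, n = 2, isgn = 1, ld = 2;
    /* upper triangular A and B, complex entries stored as float pairs */
    float A[8] = { 1,0, 0,0, 0,0, 2,0 };
    float B[8] = { 3,0, 0,0, 0,0, 4,0 };
    float C[8] = { 1,0, 1,0, 1,0, 1,0 };   /* right-hand side, overwritten by X */
    float scale;
    int info;
    RELAPACK_ctrsyl("N", "N", &isgn, &m, &n, A, &ld, B, &ld, C, &ld, &scale, &info);
    printf("ctrsyl scale = %g, info = %d\n", scale, info);
    return 0;
}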

392
relapack/src/ctrsyl_rec2.c Normal file
View File

@ -0,0 +1,392 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "../config.h"
#include "f2c.h"
#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
extern void cdotu_(complex *, int *, complex *, int *, complex *, int *);
complex result;
cdotu_(&result, n, x, incx, y, incy);
return result;
}
#define cdotu_ cdotu_fun
complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
extern void cdotc_(complex *, int *, complex *, int *, complex *, int *);
complex result;
cdotc_(&result, n, x, incx, y, incy);
return result;
}
#define cdotc_ cdotc_fun
#endif
#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
complex cladiv_fun(complex *a, complex *b) {
extern void cladiv_(complex *, complex *, complex *);
complex result;
cladiv_(&result, a, b);
return result;
}
#define cladiv_ cladiv_fun
#endif
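/* Rationale for the wrappers above: BLAS/LAPACK routines that return a
 * complex value (cdotu, cdotc, cladiv) may be compiled either as true
 * functions or as subroutines that write the result through a hidden first
 * argument, depending on the Fortran compiler's calling convention; the
 * config macros select whichever variant matches the libraries being linked. */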
/* Table of constant values */
static int c__1 = 1;
/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
*
* This routine is an exact copy of LAPACK's ctrsyl.
* It serves as an unblocked kernel in the recursive algorithms.
* */
/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int
*isgn, int *m, int *n, complex *a, int *lda, complex *b,
int *ldb, complex *c__, int *ldc, float *scale, int *info,
ftnlen trana_len, ftnlen tranb_len)
{
/* System generated locals */
int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
i__3, i__4;
float r__1, r__2;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
float r_imag(complex *);
void r_cnjg(complex *, complex *);
/* Local variables */
static int j, k, l;
static complex a11;
static float db;
static complex x11;
static float da11;
static complex vec;
static float dum[1], eps, sgn, smin;
static complex suml, sumr;
/* Complex */ complex cdotc_(int *, complex *, int
*, complex *, int *);
extern int lsame_(char *, char *, ftnlen, ftnlen);
/* Complex */ complex cdotu_(int *, complex *, int
*, complex *, int *);
extern /* Subroutine */ int slabad_(float *, float *);
extern float clange_(char *, int *, int *, complex *,
int *, float *, ftnlen);
/* Complex */ complex cladiv_(complex *, complex *);
static float scaloc;
extern float slamch_(char *, ftnlen);
extern /* Subroutine */ int csscal_(int *, float *, complex *, int
*), xerbla_(char *, int *, ftnlen);
static float bignum;
static int notrna, notrnb;
static float smlnum;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
b_dim1 = *ldb;
b_offset = 1 + b_dim1;
b -= b_offset;
c_dim1 = *ldc;
c_offset = 1 + c_dim1;
c__ -= c_offset;
/* Function Body */
notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
*info = 0;
if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
*info = -1;
} else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
*info = -2;
} else if (*isgn != 1 && *isgn != -1) {
*info = -3;
} else if (*m < 0) {
*info = -4;
} else if (*n < 0) {
*info = -5;
} else if (*lda < max(1,*m)) {
*info = -7;
} else if (*ldb < max(1,*n)) {
*info = -9;
} else if (*ldc < max(1,*m)) {
*info = -11;
}
if (*info != 0) {
i__1 = -(*info);
xerbla_("CTRSY2", &i__1, (ftnlen)6);
return;
}
*scale = 1.f;
if (*m == 0 || *n == 0) {
return;
}
eps = slamch_("P", (ftnlen)1);
smlnum = slamch_("S", (ftnlen)1);
bignum = 1.f / smlnum;
slabad_(&smlnum, &bignum);
smlnum = smlnum * (float) (*m * *n) / eps;
bignum = 1.f / smlnum;
/* Computing MAX */
r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, (
ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n,
&b[b_offset], ldb, dum, (ftnlen)1);
smin = dmax(r__1,r__2);
sgn = (float) (*isgn);
if (notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
for (k = *m; k >= 1; --k) {
i__2 = *m - k;
/* Computing MIN */
i__3 = k + 1;
/* Computing MIN */
i__4 = k + 1;
q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
min(i__4,*m) + l * c_dim1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__2 = l - 1;
q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = q__1.r, sumr.i = q__1.i;
i__2 = k + l * c_dim1;
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i;
q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L10: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L20: */
}
/* L30: */
}
} else if (! notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
i__2 = *m;
for (k = 1; k <= i__2; ++k) {
i__3 = k - 1;
q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__3 = l - 1;
q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = q__1.r, sumr.i = q__1.i;
i__3 = k + l * c_dim1;
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
r_cnjg(&q__2, &a[k + k * a_dim1]);
i__3 = l + l * b_dim1;
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L40: */
}
*scale *= scaloc;
}
i__3 = k + l * c_dim1;
c__[i__3].r = x11.r, c__[i__3].i = x11.i;
/* L50: */
}
/* L60: */
}
} else if (! notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
i__1 = *m;
for (k = 1; k <= i__1; ++k) {
i__2 = k - 1;
q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__2 = *n - l;
/* Computing MIN */
i__3 = l + 1;
/* Computing MIN */
i__4 = l + 1;
q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
l + min(i__4,*n) * b_dim1], ldb);
sumr.r = q__1.r, sumr.i = q__1.i;
i__2 = k + l * c_dim1;
r_cnjg(&q__4, &sumr);
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i;
r_cnjg(&q__1, &q__2);
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L70: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L80: */
}
/* L90: */
}
} else if (notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
for (k = *m; k >= 1; --k) {
i__1 = *m - k;
/* Computing MIN */
i__2 = k + 1;
/* Computing MIN */
i__3 = k + 1;
q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
min(i__3,*m) + l * c_dim1], &c__1);
suml.r = q__1.r, suml.i = q__1.i;
i__1 = *n - l;
/* Computing MIN */
i__2 = l + 1;
/* Computing MIN */
i__3 = l + 1;
q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
l + min(i__3,*n) * b_dim1], ldb);
sumr.r = q__1.r, sumr.i = q__1.i;
i__1 = k + l * c_dim1;
r_cnjg(&q__4, &sumr);
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i;
vec.r = q__1.r, vec.i = q__1.i;
scaloc = 1.f;
i__1 = k + k * a_dim1;
r_cnjg(&q__3, &b[l + l * b_dim1]);
q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i;
q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i;
a11.r = q__1.r, a11.i = q__1.i;
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
dabs(r__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.f;
da11 = smin;
*info = 1;
}
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
r__2));
if (da11 < 1.f && db > 1.f) {
if (db > bignum * da11) {
scaloc = 1.f / db;
}
}
q__3.r = scaloc, q__3.i = 0.f;
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
q__3.i + vec.i * q__3.r;
q__1 = cladiv_(&q__2, &a11);
x11.r = q__1.r, x11.i = q__1.i;
if (scaloc != 1.f) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L100: */
}
*scale *= scaloc;
}
i__1 = k + l * c_dim1;
c__[i__1].r = x11.r, c__[i__1].i = x11.i;
/* L110: */
}
/* L120: */
}
}
return;
}

107
relapack/src/ctrtri.c Normal file
View File

@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_ctrtri_rec(const char *, const char *, const int *,
float *, const int *, int *);
/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's ctrtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html
* */
void RELAPACK_ctrtri(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("CTRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
*info = i + 1;
return;
}
}
// Recursive kernel
RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** ctrtri's recursive compute kernel */
static void RELAPACK_ctrtri_rec(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_CTRTRI, 1)) {
// Unblocked
LAPACK(ctrti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1., 0. };
const float MONE[] = { -1., 0. };
// Splitting
const int n1 = CREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + 2 * *ldA * n1;
float *const A_BL = A + 2 * n1;
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
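The recursion above realizes the block identity for triangular inversion: for a lower triangular A = [A_TL 0; A_BL A_BR], the inverse has off-diagonal block -inv(A_BR) * A_BL * inv(A_TL), which is exactly the trmm/trsm pair applied to A_BL (and symmetrically for the upper case). A hedged calling sketch on a real-valued 2x2 example, with the prototype repeated from the definition above:

#include <stdio.h>

void RELAPACK_ctrtri(const char *uplo, const char *diag, const int *n,
                     float *A, const int *ldA, int *info);

int main(void) {
    const int n = 2, ldA = 2;
    /* lower triangular, non-unit diagonal: A = [2 0; 1 4], stored as float pairs */
    float A[8] = { 2,0, 1,0, 0,0, 4,0 };
    int info;
    RELAPACK_ctrtri("L", "N", &n, A, &ldA, &info);
    /* expected inverse: [0.5 0; -0.125 0.25] */
    printf("ctrtri info = %d, inv(A)(2,1) = %g\n", info, A[2]);
    return 0;
}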

227
relapack/src/dgbtrf.c Normal file
View File

@ -0,0 +1,227 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *,
const int *, double *, const int *, int *, double *, const int *, double *,
const int *, int *);
/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's dgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html
* */
void RELAPACK_dgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DGBTRF", &minfo);
return;
}
// Constant
const double ZERO[] = { 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + kv;
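// (LAPACK band storage keeps A(i,j) at Ab[kv + i - j + *ldAb * j] (0-based);
//  shifting the base pointer by kv and shrinking the leading dimension by 1
//  lets A[i + *ldA * j] address the same entry, so the band is handled below
//  like an ordinary column-major matrix.)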
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
double *const A_j = A + *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[i] = 0.;
}
// Allocate work space
const int n1 = DREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
double *Workl = malloc(mWorkl * nWorkl * sizeof(double));
double *Worku = malloc(mWorku * nWorku * sizeof(double));
LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** dgbtrf's recursive compute kernel */
static void RELAPACK_dgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_DGBTRF, 1)) {
// Unblocked
LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + kv;
// Splitting
const int n1 = MIN(DREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
double *const Ab_L = Ab;
double *const Ab_BR = Ab + *ldAb * n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + m1;
double *const A_BR = A + *ldA * n1 + m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl A_Rr
double *const A_Rl = A_R;
double *const A_Rr = A_R + *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + m21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + *ldA * n21;
double *const A_BRbl = A_BR + m21;
double *const A_BRbr = A_BR + *ldA * n21 + m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
double *const A_Rrj = A_Rr + *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const double tmp = A_Rrj[i];
A_Rrj[i] = A_Rrj[ip];
A_Rrj[ip] = tmp;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}
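/* Illustrative sketch (not part of the ReLAPACK sources): a hypothetical
 * example_dgbtrf that factors a small tridiagonal matrix through the interface
 * above, assuming standard LAPACK band storage (2*kl+ku+1 rows, column-major)
 * and linkage against this library. */
void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *,
    double *, const int *, int *, int *);

int example_dgbtrf(void) {
    const int n = 4, kl = 1, ku = 1, ldAb = 2 * 1 + 1 + 1;
    double Ab[4 * 4] = { 0 };  // ldAb x n band storage; the first kl rows hold fill-in
    int ipiv[4], info, j;
    for (j = 0; j < n; j++) {
        Ab[kl + ku + ldAb * j] = 4.;                        // A(j,j)
        if (j > 0)     Ab[kl + ku - 1 + ldAb * j] = -1.;    // A(j-1,j)
        if (j < n - 1) Ab[kl + ku + 1 + ldAb * j] = -1.;    // A(j+1,j)
    }
    RELAPACK_dgbtrf(&n, &n, &kl, &ku, Ab, &ldAb, ipiv, &info);
    return info;  // 0 on success
}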

165
relapack/src/dgemmt.c Normal file
View File

@ -0,0 +1,165 @@
#include "relapack.h"
static void RELAPACK_dgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
/** DGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_dgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !notransA)
info = 2;
else if (!tranB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("DGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : 'T';
const char cleantransB = notransB ? 'N' : 'T';
// Recursive kernel
RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** dgemmt's recursive compute kernel */
static void RELAPACK_dgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_DGEMMT, 1)) {
// Unblocked
RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const double *const A_T = A;
const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const double *const B_L = B;
const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
double *const C_TL = C;
double *const C_TR = C + *ldC * n1;
double *const C_BL = C + n1;
double *const C_BR = C + *ldC * n1 + n1;
// recursion(C_TL)
RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** dgemmt's unblocked compute kernel */
static void RELAPACK_dgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const double *const A_0 = A;
const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
double *const C_0i = C + *ldC * i;
double *const C_ii = C + *ldC * i + i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}
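/* Illustrative sketch (not part of the ReLAPACK sources): a hypothetical
 * example_dgemmt that accumulates only the lower triangle of C := A * A^T,
 * i.e. a syrk-like update expressed through the gemmt interface above. */
void RELAPACK_dgemmt(const char *, const char *, const char *, const int *,
    const int *, const double *, const double *, const int *, const double *,
    const int *, const double *, double *, const int *);

void example_dgemmt(void) {
    const int n = 3, k = 2;
    const double alpha = 1., beta = 0.;
    const double A[3 * 2] = { 1., 2., 3.,   4., 5., 6. };  // 3 x 2, column-major
    double C[3 * 3] = { 0 };
    // Only the lower triangle of C is written; the upper triangle stays untouched.
    RELAPACK_dgemmt("L", "N", "T", &n, &k, &alpha, A, &n, A, &n, &beta, C, &n);
}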

117
relapack/src/dgetrf.c Normal file
View File

@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
const int *, int *, int *);
/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's dgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
* */
void RELAPACK_dgetrf(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const double ONE[] = { 1. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const double *const A_L = A;
double *const A_R = A + *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** dgetrf's recursive compute kernel */
static void RELAPACK_dgetrf_rec(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
// Unblocked
LAPACK(dgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}
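/* Illustrative sketch (not part of the ReLAPACK sources): a hypothetical
 * example_dgetrf that factors a small column-major matrix in place, assuming
 * linkage against this library. */
void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *);

int example_dgetrf(void) {
    const int n = 3;
    double A[3 * 3] = { 4., 2., 1.,   2., 5., 3.,   1., 3., 6. };  // column-major
    int ipiv[3], info;
    RELAPACK_dgetrf(&n, &n, A, &n, ipiv, &info);  // A now holds the unit-lower L and upper U
    return info;  // 0 on success, i > 0 if U(i,i) is exactly zero
}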

87
relapack/src/dlauum.c Normal file
View File

@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_dlauum_rec(const char *, const int *, double *,
const int *, int *);
/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's dlauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html
* */
void RELAPACK_dlauum(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info);
}
/** dlauum's recursive compute kernel */
static void RELAPACK_dlauum_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_DLAUUM, 1)) {
// Unblocked
LAPACK(dlauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info);
}
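/* Illustrative sketch (not part of the ReLAPACK sources): a hypothetical
 * example_dlauum that overwrites a small lower-triangular factor L with
 * L^T * L, the typical final step when forming a matrix inverse from its
 * Cholesky factor. */
void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);

void example_dlauum(void) {
    const int n = 3;
    // Lower-triangular L, column-major; the strictly upper part is not referenced.
    double L[3 * 3] = { 2., 1., 1.,   0., 2., 1.,   0., 0., 2. };
    int info;
    RELAPACK_dlauum("L", &n, L, &n, &info);  // lower triangle of L now holds L^T * L
}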

157
relapack/src/dpbtrf.c Normal file
View File

@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *,
double *, const int *, double *, const int *, int *);
/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's dpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html
* */
void RELAPACK_dpbtrf(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const double ZERO[] = { 0. };
// Allocate work space
const int n1 = DREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
double *Work = malloc(mWork * nWork * sizeof(double));
LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** dpbtrf's recursive compute kernel */
static void RELAPACK_dpbtrf_rec(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
double *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
// Unblocked
LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(DREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
double *const Ab_BR = Ab + *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, n1);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + n21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + *ldA * n21;
double *const A_BRbl = A_BR + n21;
double *const A_BRbr = A_BR + *ldA * n21 + n21;
if (*uplo == 'L') {
// A_BLt = A_BLt / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}
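/* Illustrative sketch (not part of the ReLAPACK sources): a hypothetical
 * example_dpbtrf that factors a symmetric positive definite tridiagonal matrix
 * given in standard LAPACK lower band storage (kd + 1 rows, column-major). */
void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);

void example_dpbtrf(void) {
    const int n = 4, kd = 1, ldAb = kd + 1;
    double Ab[2 * 4] = { 0 };
    int info, j;
    for (j = 0; j < n; j++) {
        Ab[0 + ldAb * j] = 4.;                   // A(j,j)
        if (j < n - 1) Ab[1 + ldAb * j] = -1.;   // A(j+1,j)
    }
    RELAPACK_dpbtrf("L", &n, &kd, Ab, &ldAb, &info);  // Ab now holds the banded factor L
}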

92
relapack/src/dpotrf.c Normal file
View File

@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_dpotrf_rec(const char *, const int *, double *,
const int *, int *);
/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's dpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
* */
void RELAPACK_dpotrf(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** dpotrf's recursive compute kernel */
static void RELAPACK_dpotrf_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_DPOTRF, 1)) {
// Unblocked
LAPACK(dpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
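/* Illustrative sketch (not part of the ReLAPACK sources): a hypothetical
 * example_dpotrf factoring a small symmetric positive definite matrix in the
 * lower-triangular variant. */
void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);

int example_dpotrf(void) {
    const int n = 3;
    double A[3 * 3] = { 4., 2., 2.,   2., 5., 3.,   2., 3., 6. };  // SPD, column-major
    int info;
    RELAPACK_dpotrf("L", &n, A, &n, &info);  // lower triangle of A now holds L with A = L * L^T
    return info;  // 0 on success, i > 0 if the leading minor of order i is not positive definite
}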

212
relapack/src/dsygst.c Normal file
View File

@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_dsygst_rec(const int *, const char *, const int *,
double *, const int *, const double *, const int *,
double *, const int *, int *);
/** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's dsygst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html
* */
void RELAPACK_dsygst(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DSYGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
double *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = DREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * sizeof(double));
if (!Work)
lWork = 0;
#endif
// recursive kernel
RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** dsygst's recursive compute kernel */
static void RELAPACK_dsygst_rec(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
double *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_DSYGST, 1)) {
// Unblocked
LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const double ZERO[] = { 0. };
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const double HALF[] = { .5 };
const double MHALF[] = { -.5 };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// B_TL B_TR
// B_BL B_BR
const double *const B_TL = B;
const double *const B_TR = B + *ldB * n1;
const double *const B_BL = B + n1;
const double *const B_BR = B + *ldB * n1 + n1;
// recursion(A_TL, B_TL)
RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}
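/* Illustrative sketch (not part of the ReLAPACK sources): a hypothetical
 * example_dsygst reducing the generalized eigenproblem A*x = lambda*B*x to
 * standard form; B must already contain its Cholesky factor, here obtained
 * from RELAPACK_dpotrf. */
void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *,
    const double *, const int *, int *);

void example_dsygst(void) {
    const int itype = 1, n = 2;
    double A[2 * 2] = { 2., 1.,   1., 3. };  // symmetric A, column-major
    double B[2 * 2] = { 4., 1.,   1., 2. };  // symmetric positive definite B
    int info;
    RELAPACK_dpotrf("L", &n, B, &n, &info);                  // B := L with B = L * L^T
    RELAPACK_dsygst(&itype, "L", &n, A, &n, B, &n, &info);   // A := inv(L) * A * inv(L^T)
}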

238
relapack/src/dsytrf.c Normal file
View File

@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** DSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's dsytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html
* */
void RELAPACK_dsytrf(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** dsytrf's recursive compute kernel */
static void RELAPACK_dsytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_DSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = DREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
double *const A_BL_B = A + *n;
double *const A_BR_B = A + *ldA * n1 + *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + n1;
double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = DREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + *ldA * n_rest;
double *const A_TR_T = A + *ldA * (n_rest + n1);
double *const A_TL = A + *ldA * n_rest + n_rest;
double *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
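/* Illustrative sketch (not part of the ReLAPACK sources): a hypothetical
 * example_dsytrf showing the workspace query protocol (lWork = -1) followed by
 * the factorization with an allocated buffer of the reported size. */
#include <stdlib.h>

void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *,
    double *, const int *, int *);

void example_dsytrf(void) {
    const int n = 4;
    double A[4 * 4] = {
        4., 1., 2., 0.,
        1., 5., 1., 2.,
        2., 1., 6., 1.,
        0., 2., 1., 7.
    };  // symmetric, column-major; only the lower triangle is referenced here
    int ipiv[4], info;
    double query;
    int lWork = -1;
    RELAPACK_dsytrf("L", &n, A, &n, ipiv, &query, &lWork, &info);  // size query
    lWork = (int)query;                                            // n * (n/2)
    double *Work = malloc(lWork * sizeof(double));
    RELAPACK_dsytrf("L", &n, A, &n, ipiv, Work, &lWork, &info);    // factorize
    free(Work);
}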

352
relapack/src/dsytrf_rec2.c Normal file
View File

@ -0,0 +1,352 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static int c__1 = 1;
static double c_b8 = -1.;
static double c_b9 = 1.;
/** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's dlasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int *
nb, int *kb, double *a, int *lda, int *ipiv,
double *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
double d__1, d__2, d__3;
/* Builtin functions */
double sqrt(double);
/* Local variables */
static int j, k;
static double t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static double alpha;
extern /* Subroutine */ int dscal_(int *, double *, double *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int dgemv_(char *, int *, int *,
double *, double *, int *, double *, int *,
double *, double *, int *, ftnlen), dcopy_(int *,
double *, int *, double *, int *), dswap_(int
*, double *, int *, double *, int *);
static int kstep;
static double absakk;
extern int idamax_(int *, double *, int *);
static double colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
absakk = (d__1 = w[k + kw * w_dim1], abs(d__1));
if (k > 1) {
i__1 = k - 1;
imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1));
if (imax > 1) {
i__1 = imax - 1;
jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1],
abs(d__1));
rowmax = max(d__2,d__3);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >=
alpha * rowmax) {
kp = imax;
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
i__1 = kk - 1 - kp;
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
r1 = 1. / a[k + k * a_dim1];
i__1 = k - 1;
dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
d21 = w[k - 1 + kw * w_dim1];
d11 = w[k + kw * w_dim1] / d21;
d22 = w[k - 1 + (kw - 1) * w_dim1] / d21;
t = 1. / (d11 * d22 - 1.);
d21 = t / d21;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1)
* w_dim1] - w[j + kw * w_dim1]);
a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] -
w[j + (kw - 1) * w_dim1]);
/* L20: */
}
}
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
a[k + k * a_dim1] = w[k + kw * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
absakk = (d__1 = w[k + k * w_dim1], abs(d__1));
if (k < *n) {
i__1 = *n - k;
imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
colmax = (d__1 = w[imax + k * w_dim1], abs(d__1));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1],
abs(d__1));
rowmax = max(d__2,d__3);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >=
alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
kk = k + kstep - 1;
if (kp != kk) {
a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
i__1 = kp - kk - 1;
dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
r1 = 1. / a[k + k * a_dim1];
i__1 = *n - k;
dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
d21 = w[k + 1 + k * w_dim1];
d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
d22 = w[k + k * w_dim1] / d21;
t = 1. / (d11 * d22 - 1.);
d21 = t / d21;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] -
w[j + (k + 1) * w_dim1]);
a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) *
w_dim1] - w[j + k * w_dim1]);
/* L80: */
}
}
a[k + k * a_dim1] = w[k + k * w_dim1];
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/dsytrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's dsytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html
* */
void RELAPACK_dsytrf_rook(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** dsytrf_rook's recursive compute kernel */
static void RELAPACK_dsytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = DREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
double *const A_BL_B = A + *n;
double *const A_BR_B = A + *ldA * n1 + *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + n1;
double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = DREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + *ldA * n_rest;
double *const A_TR_T = A + *ldA * (n_rest + n1);
double *const A_TL = A + *ldA * n_rest + n_rest;
double *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
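/* Illustrative sketch (not part of the ReLAPACK sources): the rook variant has
 * the same interface as RELAPACK_dsytrf; this hypothetical example_dsytrf_rook
 * works on the upper triangle with a statically sized workspace of n*(n/2)
 * doubles, so no workspace query is needed. */
void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *,
    double *, const int *, int *);

void example_dsytrf_rook(void) {
    const int n = 4, lWork = 4 * (4 / 2);  // n * (n/2), as required above
    double A[4 * 4] = {
        4., 1., 2., 0.,
        1., 5., 1., 2.,
        2., 1., 6., 1.,
        0., 2., 1., 7.
    };  // symmetric, column-major; only the upper triangle is referenced here
    double Work[4 * (4 / 2)];
    int ipiv[4], info;
    RELAPACK_dsytrf_rook("U", &n, A, &n, ipiv, Work, &lWork, &info);
}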

451
relapack/src/dsytrf_rook_rec2.c Normal file
View File

@ -0,0 +1,451 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static int c__1 = 1;
static double c_b9 = -1.;
static double c_b10 = 1.;
/** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's dlasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, double *a, int *lda, int *ipiv,
double *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
double d__1;
/* Builtin functions */
double sqrt(double);
/* Local variables */
static int j, k, p;
static double t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static double alpha;
extern /* Subroutine */ int dscal_(int *, double *, double *,
int *);
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int dgemv_(char *, int *, int *,
double *, double *, int *, double *, int *,
double *, double *, int *, ftnlen);
static double dtemp, sfmin;
static int itemp;
extern /* Subroutine */ int dcopy_(int *, double *, int *,
double *, int *), dswap_(int *, double *, int
*, double *, int *);
static int kstep;
extern double dlamch_(char *, ftnlen);
static double absakk;
extern int idamax_(int *, double *, int *);
static double colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
sfmin = dlamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
absakk = (d__1 = w[k + kw * w_dim1], abs(d__1));
if (k > 1) {
i__1 = k - 1;
imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1));
} else {
rowmax = 0.;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) <
alpha * rowmax)) {
kp = imax;
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
a[kp + k * a_dim1] = a[kk + k * a_dim1];
i__1 = k - 1 - kp;
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
r1 = 1. / a[k + k * a_dim1];
i__1 = k - 1;
dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else if (a[k + k * a_dim1] != 0.) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
a[ii + k * a_dim1] /= a[k + k * a_dim1];
/* L14: */
}
}
}
} else {
if (k > 2) {
d12 = w[k - 1 + kw * w_dim1];
d11 = w[k + kw * w_dim1] / d12;
d22 = w[k - 1 + (kw - 1) * w_dim1] / d12;
t = 1. / (d11 * d22 - 1.);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) *
w_dim1] - w[j + kw * w_dim1]) / d12);
a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] -
w[j + (kw - 1) * w_dim1]) / d12);
/* L20: */
}
}
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
a[k + k * a_dim1] = w[k + kw * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
absakk = (d__1 = w[k + k * w_dim1], abs(d__1));
if (k < *n) {
i__1 = *n - k;
imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
colmax = (d__1 = w[imax + k * w_dim1], abs(d__1));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k
+ 1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1));
} else {
rowmax = 0.;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha
* rowmax)) {
kp = imax;
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
a[kp + k * a_dim1] = a[kk + k * a_dim1];
i__1 = kp - k - 1;
dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
r1 = 1. / a[k + k * a_dim1];
i__1 = *n - k;
dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else if (a[k + k * a_dim1] != 0.) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
a[ii + k * a_dim1] /= a[k + k * a_dim1];
/* L74: */
}
}
}
} else {
if (k < *n - 1) {
d21 = w[k + 1 + k * w_dim1];
d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
d22 = w[k + k * w_dim1] / d21;
t = 1. / (d11 * d22 - 1.);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
j + (k + 1) * w_dim1]) / d21);
a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
w_dim1] - w[j + k * w_dim1]) / d21);
/* L80: */
}
}
a[k + k * a_dim1] = w[k + k * w_dim1];
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

274
relapack/src/dtgsyl.c Normal file
View File

@ -0,0 +1,274 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
const int *, const double *, const int *, const double *, const int *,
double *, const int *, const double *, const int *, const double *,
const int *, double *, const int *, double *, double *, double *, int *,
int *, int *);
/** DTGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's dtgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html
* */
void RELAPACK_dtgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "T");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DTGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'T';
// Constant
const double ZERO[] = { 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
double scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
double dscale = 0;
double dsum = 1;
int pq;
RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(pq) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(dlacpy)("F", m, n, C, ldC, Work, m);
LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m);
LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(dlacpy)("F", m, n, Work, m, C, ldC);
LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** dtgsyl's recursive compute kernel */
static void RELAPACK_dtgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dsum, double *dscale,
int *iWork, int *pq, int *info
) {
if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) {
// Unblocked
LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const int iONE[] = { 1 };
// Outputs
double scale1[] = { 1. };
double scale2[] = { 1. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
int m1 = DREC_SPLIT(*m);
if (A[m1 + *ldA * (m1 - 1)])
m1++;
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const double *const A_TL = A;
const double *const A_TR = A + *ldA * m1;
const double *const A_BR = A + *ldA * m1 + m1;
// C_T
// C_B
double *const C_T = C;
double *const C_B = C + m1;
// D_TL D_TR
// 0 D_BR
const double *const D_TL = D;
const double *const D_TR = D + *ldD * m1;
const double *const D_BR = D + *ldD * m1 + m1;
// F_T
// F_B
double *const F_T = F;
double *const F_B = F + m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
// C_T = C_T - A_TR * C_B
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
int n1 = DREC_SPLIT(*n);
if (B[n1 + *ldB * (n1 - 1)])
n1++;
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const double *const B_TL = B;
const double *const B_TR = B + *ldB * n1;
const double *const B_BR = B + *ldB * n1 + n1;
// C_L C_R
double *const C_L = C;
double *const C_R = C + *ldC * n1;
// E_TL E_TR
// 0 E_BR
const double *const E_TL = E;
const double *const E_TR = E + *ldE * n1;
const double *const E_BR = E + *ldE * n1 + n1;
// F_L F_R
double *const F_L = F;
double *const F_R = F + *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
// C_R = C_R + F_L * B_TR
BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
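For orientation, a minimal call sketch of the routine above (not part of the commit; the matrix data and sizes are illustrative, and the pairs (A, D) and (B, E) must already be in generalized Schur form as dtgsyl requires):

    #include "relapack.h"

    int main(void) {
        // Solve A*R - L*B = scale*C, D*R - L*E = scale*F for 2x2 triangular pairs
        const int m = 2, n = 2, ijob = 0, ld = 2, lwork = 1;
        double A[] = {2, 0, 1, 3}, B[] = {1, 0, 0, 1};   // column-major, upper triangular
        double D[] = {1, 0, 0, 1}, E[] = {1, 0, 0, 2};
        double C[] = {1, 2, 3, 4}, F[] = {0, 1, 1, 0};   // overwritten with R and L
        double scale, dif, work[1];
        int iwork[16], info;
        RELAPACK_dtgsyl("N", &ijob, &m, &n, A, &ld, B, &ld, C, &ld,
                        D, &ld, E, &ld, F, &ld, &scale, &dif,
                        work, &lwork, iwork, &info);
        return info;
    }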

169
relapack/src/dtrsyl.c Normal file
View File

@ -0,0 +1,169 @@
#include "relapack.h"
static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *,
const int *, const int *, const double *, const int *, const double *,
const int *, double *, const int *, double *, int *);
/** DTRSYL solves the real Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's dtrsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html
* */
void RELAPACK_dtrsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int transA = LAPACK(lsame)(tranA, "T");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int transB = LAPACK(lsame)(tranB, "T");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!transA && !ctransA && !notransA)
*info = -1;
else if (!transB && !ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DTRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
// Recursive kernel
RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** dtrsyl's recursive compute kernel */
static void RELAPACK_dtrsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) {
// Unblocked
RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
const double MSGN[] = { -*isgn };
const int iONE[] = { 1 };
// Outputs
double scale1[] = { 1. };
double scale2[] = { 1. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
int m1 = DREC_SPLIT(*m);
if (A[m1 + *ldA * (m1 - 1)])
m1++;
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const double *const A_TL = A;
const double *const A_TR = A + *ldA * m1;
const double *const A_BR = A + *ldA * m1 + m1;
// C_T
// C_B
double *const C_T = C;
double *const C_B = C + m1;
if (*tranA == 'N') {
// recursion(A_BR, B, C_B)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recursion(A_TL, B, C_T)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recursion(A_TL, B, C_T)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recursion(A_BR, B, C_B)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
}
} else {
// Splitting
int n1 = DREC_SPLIT(*n);
if (B[n1 + *ldB * (n1 - 1)])
n1++;
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const double *const B_TL = B;
const double *const B_TR = B + *ldB * n1;
const double *const B_BR = B + *ldB * n1 + n1;
// C_L C_R
double *const C_L = C;
double *const C_R = C + *ldC * n1;
if (*tranB == 'N') {
// recursion(A, B_TL, C_L)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recursion(A, B_BR, C_R)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recursion(A, B_BR, C_R)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recursion(A, B_TL, C_L)
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
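Likewise, a minimal call sketch for the Sylvester solver above (illustrative data; A and B must be upper quasi-triangular, as dtrsyl expects Schur-form inputs):

    #include "relapack.h"

    int main(void) {
        // Solve A*X + X*B = scale*C for 2x2 upper triangular A and B
        const int m = 2, n = 2, isgn = 1, ld = 2;
        double A[] = {1, 0, 2, 3};   // column-major, upper triangular
        double B[] = {4, 0, 1, 5};
        double C[] = {1, 2, 3, 4};   // overwritten with the solution X
        double scale;
        int info;
        RELAPACK_dtrsyl("N", "N", &isgn, &m, &n, A, &ld, B, &ld, C, &ld, &scale, &info);
        return info;
    }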

1034
relapack/src/dtrsyl_rec2.c Normal file

File diff suppressed because it is too large Load Diff

107
relapack/src/dtrtri.c Normal file
View File

@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_dtrtri_rec(const char *, const char *, const int *,
double *, const int *, int *);
/** DTRTRI computes the inverse of a real upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's dtrtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html
* */
void RELAPACK_dtrtri(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("DTRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[i + *ldA * i] == 0) {
*info = i + 1;
return;
}
}
// Recursive kernel
RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** dtrtri's recursive compute kernel */
static void RELAPACK_dtrtri_rec(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_DTRTRI, 1)) {
// Unblocked
LAPACK(dtrti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1. };
const double MONE[] = { -1. };
// Splitting
const int n1 = DREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + *ldA * n1;
double *const A_BL = A + n1;
double *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
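And a minimal call sketch for the triangular inversion above (illustrative 3x3 data, inverted in place):

    #include "relapack.h"

    int main(void) {
        // Invert a lower triangular, non-unit-diagonal 3x3 matrix
        const int n = 3, ld = 3;
        double A[] = {2, 1, 4,  0, 3, 5,  0, 0, 6};   // column-major, lower triangular
        int info;
        RELAPACK_dtrtri("L", "N", &n, A, &ld, &info);
        return info;   // 0 on success
    }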

109
relapack/src/f2c.c Normal file
View File

@ -0,0 +1,109 @@
#include "stdlib.h"
#include "stdio.h"
#include "signal.h"
#include "f2c.h"
#ifndef SIGIOT
#ifdef SIGABRT
#define SIGIOT SIGABRT
#endif
#endif
void sig_die(const char *s, int kill) {
/* print error message, then clear buffers */
fprintf(stderr, "%s\n", s);
if(kill) {
fflush(stderr);
/* now get a core */
signal(SIGIOT, SIG_DFL);
abort();
} else
exit(1);
}
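/* Editor's note (added comment, not in the original source): c_div and z_div
   below divide complex numbers using Smith's scaling, dividing through by
   whichever of |Re b| and |Im b| is larger, to avoid intermediate overflow
   and underflow. */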
void c_div(complex *c, complex *a, complex *b) {
double ratio, den;
double abr, abi, cr;
if( (abr = b->r) < 0.)
abr = - abr;
if( (abi = b->i) < 0.)
abi = - abi;
if( abr <= abi ) {
if(abi == 0) {
#ifdef IEEE_COMPLEX_DIVIDE
float af, bf;
af = bf = abr;
if (a->i != 0 || a->r != 0)
af = 1.;
c->i = c->r = af / bf;
return;
#else
sig_die("complex division by zero", 1);
#endif
}
ratio = (double)b->r / b->i ;
den = b->i * (1 + ratio*ratio);
cr = (a->r*ratio + a->i) / den;
c->i = (a->i*ratio - a->r) / den;
} else {
ratio = (double)b->i / b->r ;
den = b->r * (1 + ratio*ratio);
cr = (a->r + a->i*ratio) / den;
c->i = (a->i - a->r*ratio) / den;
}
c->r = cr;
}
void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) {
double ratio, den;
double abr, abi, cr;
if( (abr = b->r) < 0.)
abr = - abr;
if( (abi = b->i) < 0.)
abi = - abi;
if( abr <= abi ) {
if(abi == 0) {
#ifdef IEEE_COMPLEX_DIVIDE
if (a->i != 0 || a->r != 0)
abi = 1.;
c->i = c->r = abi / abr;
return;
#else
sig_die("complex division by zero", 1);
#endif
}
ratio = b->r / b->i ;
den = b->i * (1 + ratio*ratio);
cr = (a->r*ratio + a->i) / den;
c->i = (a->i*ratio - a->r) / den;
} else {
ratio = b->i / b->r ;
den = b->r * (1 + ratio*ratio);
cr = (a->r + a->i*ratio) / den;
c->i = (a->i - a->r*ratio) / den;
}
c->r = cr;
}
float r_imag(complex *z) {
return z->i;
}
void r_cnjg(complex *r, complex *z) {
float zi = z->i;
r->r = z->r;
r->i = -zi;
}
double d_imag(doublecomplex *z) {
return z->i;
}
void d_cnjg(doublecomplex *r, doublecomplex *z) {
double zi = z->i;
r->r = z->r;
r->i = -zi;
}

223
relapack/src/f2c.h Normal file
View File

@ -0,0 +1,223 @@
/* f2c.h -- Standard Fortran to C header file */
/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed."
- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
#ifndef F2C_INCLUDE
#define F2C_INCLUDE
typedef long int integer;
typedef unsigned long int uinteger;
typedef char *address;
typedef short int shortint;
typedef float real;
typedef double doublereal;
typedef struct { real r, i; } complex;
typedef struct { doublereal r, i; } doublecomplex;
typedef long int logical;
typedef short int shortlogical;
typedef char logical1;
typedef char integer1;
#ifdef INTEGER_STAR_8 /* Adjust for integer*8. */
typedef long long longint; /* system-dependent */
typedef unsigned long long ulongint; /* system-dependent */
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#endif
#define TRUE_ (1)
#define FALSE_ (0)
/* Extern is for use with -E */
#ifndef Extern
#define Extern extern
#endif
/* I/O stuff */
#ifdef f2c_i2
/* for -i2 */
typedef short flag;
typedef short ftnlen;
typedef short ftnint;
#else
typedef long int flag;
typedef long int ftnlen;
typedef long int ftnint;
#endif
/*external read, write*/
typedef struct
{ flag cierr;
ftnint ciunit;
flag ciend;
char *cifmt;
ftnint cirec;
} cilist;
/*internal read, write*/
typedef struct
{ flag icierr;
char *iciunit;
flag iciend;
char *icifmt;
ftnint icirlen;
ftnint icirnum;
} icilist;
/*open*/
typedef struct
{ flag oerr;
ftnint ounit;
char *ofnm;
ftnlen ofnmlen;
char *osta;
char *oacc;
char *ofm;
ftnint orl;
char *oblnk;
} olist;
/*close*/
typedef struct
{ flag cerr;
ftnint cunit;
char *csta;
} cllist;
/*rewind, backspace, endfile*/
typedef struct
{ flag aerr;
ftnint aunit;
} alist;
/* inquire */
typedef struct
{ flag inerr;
ftnint inunit;
char *infile;
ftnlen infilen;
ftnint *inex; /*parameters in standard's order*/
ftnint *inopen;
ftnint *innum;
ftnint *innamed;
char *inname;
ftnlen innamlen;
char *inacc;
ftnlen inacclen;
char *inseq;
ftnlen inseqlen;
char *indir;
ftnlen indirlen;
char *infmt;
ftnlen infmtlen;
char *inform;
ftnint informlen;
char *inunf;
ftnlen inunflen;
ftnint *inrecl;
ftnint *innrec;
char *inblank;
ftnlen inblanklen;
} inlist;
#define VOID void
union Multitype { /* for multiple entry points */
integer1 g;
shortint h;
integer i;
/* longint j; */
real r;
doublereal d;
complex c;
doublecomplex z;
};
typedef union Multitype Multitype;
/*typedef long int Long;*/ /* No longer used; formerly in Namelist */
struct Vardesc { /* for Namelist */
char *name;
char *addr;
ftnlen *dims;
int type;
};
typedef struct Vardesc Vardesc;
struct Namelist {
char *name;
Vardesc **vars;
int nvars;
};
typedef struct Namelist Namelist;
#define abs(x) ((x) >= 0 ? (x) : -(x))
#define dabs(x) (doublereal)abs(x)
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))
#define dmin(a,b) (doublereal)min(a,b)
#define dmax(a,b) (doublereal)max(a,b)
#define bit_test(a,b) ((a) >> (b) & 1)
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
/* procedure parameter types for -A and -C++ */
#define F2C_proc_par_types 1
#ifdef __cplusplus
typedef int /* Unknown procedure type */ (*U_fp)(...);
typedef shortint (*J_fp)(...);
typedef integer (*I_fp)(...);
typedef real (*R_fp)(...);
typedef doublereal (*D_fp)(...), (*E_fp)(...);
typedef /* Complex */ VOID (*C_fp)(...);
typedef /* Double Complex */ VOID (*Z_fp)(...);
typedef logical (*L_fp)(...);
typedef shortlogical (*K_fp)(...);
typedef /* Character */ VOID (*H_fp)(...);
typedef /* Subroutine */ int (*S_fp)(...);
#else
typedef int /* Unknown procedure type */ (*U_fp)();
typedef shortint (*J_fp)();
typedef integer (*I_fp)();
typedef real (*R_fp)();
typedef doublereal (*D_fp)(), (*E_fp)();
typedef /* Complex */ VOID (*C_fp)();
typedef /* Double Complex */ VOID (*Z_fp)();
typedef logical (*L_fp)();
typedef shortlogical (*K_fp)();
typedef /* Character */ VOID (*H_fp)();
typedef /* Subroutine */ int (*S_fp)();
#endif
/* E_fp is for real functions when -R is not specified */
typedef VOID C_f; /* complex function */
typedef VOID H_f; /* character function */
typedef VOID Z_f; /* double complex function */
typedef doublereal E_f; /* real function with -R not specified */
/* undef any lower-case symbols that your C compiler predefines, e.g.: */
#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif
#endif

80
relapack/src/lapack.h Normal file
View File

@ -0,0 +1,80 @@
#ifndef LAPACK_H
#define LAPACK_H
extern int LAPACK(lsame)(const char *, const char *);
extern int LAPACK(xerbla)(const char *, const int *);
extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *);
extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *);
extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *);
extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *);
extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *);
extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *);
extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *);
extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *);
extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *);
extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *);
extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *);
extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *);
extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *);
extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *);
extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *);
extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *);
extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *);
extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *);
extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *);
extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *);
extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *);
extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *);
extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *);
extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *);
#endif /* LAPACK_H */

View File

@ -0,0 +1,607 @@
#include "relapack.h"
////////////
// XLAUUM //
////////////
#if INCLUDE_SLAUUM
void LAPACK(slauum)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_slauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DLAUUM
void LAPACK(dlauum)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dlauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CLAUUM
void LAPACK(clauum)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_clauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZLAUUM
void LAPACK(zlauum)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_zlauum(uplo, n, A, ldA, info);
}
#endif
////////////
// XSYGST //
////////////
#if INCLUDE_SSYGST
void LAPACK(ssygst)(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_DSYGST
void LAPACK(dsygst)(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_CHEGST
void LAPACK(chegst)(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_ZHEGST
void LAPACK(zhegst)(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
////////////
// XTRTRI //
////////////
#if INCLUDE_STRTRI
void LAPACK(strtri)(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_strtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_DTRTRI
void LAPACK(dtrtri)(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_CTRTRI
void LAPACK(ctrtri)(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_ZTRTRI
void LAPACK(ztrtri)(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
}
#endif
////////////
// XPOTRF //
////////////
#if INCLUDE_SPOTRF
void LAPACK(spotrf)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_spotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DPOTRF
void LAPACK(dpotrf)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dpotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CPOTRF
void LAPACK(cpotrf)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_cpotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZPOTRF
void LAPACK(zpotrf)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_zpotrf(uplo, n, A, ldA, info);
}
#endif
////////////
// XPBTRF //
////////////
#if INCLUDE_SPBTRF
void LAPACK(spbtrf)(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_DPBTRF
void LAPACK(dpbtrf)(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_CPBTRF
void LAPACK(cpbtrf)(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_ZPBTRF
void LAPACK(zpbtrf)(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
////////////
// XSYTRF //
////////////
#if INCLUDE_SSYTRF
void LAPACK(ssytrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_DSYTRF
void LAPACK(dsytrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CSYTRF
void LAPACK(csytrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZSYTRF
void LAPACK(zsytrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CHETRF
void LAPACK(chetrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZHETRF
void LAPACK(zhetrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_SSYTRF_ROOK
void LAPACK(ssytrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_DSYTRF_ROOK
void LAPACK(dsytrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CSYTRF_ROOK
void LAPACK(csytrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZSYTRF_ROOK
void LAPACK(zsytrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CHETRF_ROOK
void LAPACK(chetrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZHETRF_ROOK
void LAPACK(zhetrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
////////////
// XGETRF //
////////////
#if INCLUDE_SGETRF
void LAPACK(sgetrf)(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_DGETRF
void LAPACK(dgetrf)(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_CGETRF
void LAPACK(cgetrf)(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_ZGETRF
void LAPACK(zgetrf)(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
}
#endif
////////////
// XGBTRF //
////////////
#if INCLUDE_SGBTRF
void LAPACK(sgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_DGBTRF
void LAPACK(dgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_CGBTRF
void LAPACK(cgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_ZGBTRF
void LAPACK(zgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
////////////
// XTRSYL //
////////////
#if INCLUDE_STRSYL
void LAPACK(strsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_DTRSYL
void LAPACK(dtrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_CTRSYL
void LAPACK(ctrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_ZTRSYL
void LAPACK(ztrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
////////////
// XTGSYL //
////////////
#if INCLUDE_STGSYL
void LAPACK(stgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_DTGSYL
void LAPACK(dtgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_CTGSYL
void LAPACK(ctgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_ZTGSYL
void LAPACK(ztgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
////////////
// XGEMMT //
////////////
#if INCLUDE_SGEMMT
void LAPACK(sgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
}
#endif
#if INCLUDE_DGEMMT
void LAPACK(dgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
}
#endif
#if INCLUDE_CGEMMT
void LAPACK(cgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
}
#endif
#if INCLUDE_ZGEMMT
void LAPACK(zgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
}
#endif
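To illustrate how these wrappers take effect, a sketch under the assumptions that the library was built with the corresponding INCLUDE_* switch enabled and with Fortran-style underscored symbols: an ordinary LAPACK call then resolves to the recursive ReLAPACK implementation.

    #include <stdio.h>

    // Standard LAPACK-style prototype; when the wrapper above is compiled in,
    // this symbol is provided by ReLAPACK and forwards to RELAPACK_dpotrf.
    void dpotrf_(const char *uplo, const int *n, double *A, const int *ldA, int *info);

    int main(void) {
        const int n = 2, ld = 2;
        double A[] = {4, 1, 1, 3};   // column-major, symmetric positive definite
        int info;
        dpotrf_("L", &n, A, &ld, &info);
        printf("info = %d, L(1,1) = %g\n", info, A[0]);   // expect info = 0, L(1,1) = 2
        return info;
    }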

View File

@ -0,0 +1,607 @@
#include "relapack.h"
////////////
// XLAUUM //
////////////
#if INCLUDE_SLAUUM
void LAPACK(slauum)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_slauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DLAUUM
void LAPACK(dlauum)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dlauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CLAUUM
void LAPACK(clauum)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_clauum(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZLAUUM
void LAPACK(zlauum)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_zlauum(uplo, n, A, ldA, info);
}
#endif
////////////
// XSYGST //
////////////
#if INCLUDE_SSYGST
void LAPACK(ssygst)(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_DSYGST
void LAPACK(dsygst)(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_CSYGST
void LAPACK(csygst)(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
#if INCLUDE_ZSYGST
void LAPACK(zsygst)(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info);
}
#endif
////////////
// XTRTRI //
////////////
#if INCLUDE_STRTRI
void LAPACK(strtri)(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_strtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_DTRTRI
void LAPACK(dtrtri)(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_CTRTRI
void LAPACK(ctrtri)(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
}
#endif
#if INCLUDE_ZTRTRI
void LAPACK(ztrtri)(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
}
#endif
////////////
// XPOTRF //
////////////
#if INCLUDE_SPOTRF
void LAPACK(spotrf)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_spotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_DPOTRF
void LAPACK(dpotrf)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_dpotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_CPOTRF
void LAPACK(cpotrf)(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
RELAPACK_cpotrf(uplo, n, A, ldA, info);
}
#endif
#if INCLUDE_ZPOTRF
void LAPACK(zpotrf)(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
RELAPACK_zpotrf(uplo, n, A, ldA, info);
}
#endif
////////////
// XPBTRF //
////////////
#if INCLUDE_SPBTRF
void LAPACK(spbtrf)(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_DPBTRF
void LAPACK(dpbtrf)(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_CPBTRF
void LAPACK(cpbtrf)(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
#if INCLUDE_ZPBTRF
void LAPACK(zpbtrf)(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
}
#endif
////////////
// XSYTRF //
////////////
#if INCLUDE_SSYTRF
void LAPACK(ssytrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_DSYTRF
void LAPACK(dsytrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CSYTRF
void LAPACK(csytrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZSYTRF
void LAPACK(zsytrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CHETRF
void LAPACK(chetrf)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZHETRF
void LAPACK(zhetrf)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_SSYTRF_ROOK
void LAPACK(ssytrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_DSYTRF_ROOK
void LAPACK(dsytrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CSYTRF_ROOK
void LAPACK(csytrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZSYTRF_ROOK
void LAPACK(zsytrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_CHETRF_ROOK
void LAPACK(chetrf_rook)(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
#if INCLUDE_ZHETRF_ROOK
void LAPACK(zhetrf_rook)(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
}
#endif
////////////
// XGETRF //
////////////
#if INCLUDE_SGETRF
void LAPACK(sgetrf)(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_DGETRF
void LAPACK(dgetrf)(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_CGETRF
void LAPACK(cgetrf)(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
}
#endif
#if INCLUDE_ZGETRF
void LAPACK(zgetrf)(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
}
#endif
////////////
// XGBTRF //
////////////
#if INCLUDE_SGBTRF
void LAPACK(sgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_DGBTRF
void LAPACK(dgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_CGBTRF
void LAPACK(cgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
#if INCLUDE_ZGBTRF
void LAPACK(zgbtrf)(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
}
#endif
////////////
// XTRSYL //
////////////
#if INCLUDE_STRSYL
void LAPACK(strsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_DTRSYL
void LAPACK(dtrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_CTRSYL
void LAPACK(ctrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
#if INCLUDE_ZTRSYL
void LAPACK(ztrsyl)(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
#endif
////////////
// XTGSYL //
////////////
#if INCLUDE_STGSYL
void LAPACK(stgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_DTGSYL
void LAPACK(dtgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_CTGSYL
void LAPACK(ctgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
#if INCLUDE_ZTGSYL
void LAPACK(ztgsyl)(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
}
#endif
////////////
// XGEMMT //
////////////
#if INCLUDE_SGEMMT
void LAPACK(sgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
}
#endif
#if INCLUDE_DGEMMT
void LAPACK(dgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
}
#endif
#if INCLUDE_CGEMMT
void LAPACK(cgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
}
#endif
#if INCLUDE_ZGEMMT
void LAPACK(zgemmt)(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
}
#endif

60
relapack/src/relapack.h Normal file
View File

@ -0,0 +1,60 @@
#ifndef RELAPACK_INT_H
#define RELAPACK_INT_H
#include "../config.h"
#include "../inc/relapack.h"
// add an underscore to BLAS routines (or not)
#if BLAS_UNDERSCORE
#define BLAS(routine) routine ## _
#else
#define BLAS(routine) routine
#endif
// add an underscore to LAPACK routines (or not)
#if LAPACK_UNDERSCORE
#define LAPACK(routine) routine ## _
#else
#define LAPACK(routine) routine
#endif
// minimum and maximum macros
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
// REC_SPLIT(n) returns how a problem of size n is split recursively.
// Above a (precision-dependent) threshold, we ensure that the size of at
// least one of the halves is a multiple of one 64-byte cache line (16 floats,
// 8 doubles, 8 single-complex, or 4 double-complex elements), while both
// halves stay as close as possible in size.
// Below that threshold the problem is simply split in the middle. (Note that
// the crossover size is usually larger anyway.)
#define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2)
#define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
#define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
#define ZREC_SPLIT(n) ((n >= 8) ? ((n + 4) / 8) * 4 : n / 2)
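// Worked example (illustration only): DREC_SPLIT(100) = ((100 + 8) / 16) * 8
// = 48, so a double-precision problem of size 100 splits into 48 + 52 and the
// first half spans a whole number of 64-byte cache lines; DREC_SPLIT(10) = 5.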
#include "lapack.h"
#include "blas.h"
// sytrf helper routines
void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
// trsyl helper routines
void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
#endif /* RELAPACK_INT_H */

227
relapack/src/sgbtrf.c Normal file
View File

@ -0,0 +1,227 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *,
const int *, float *, const int *, int *, float *, const int *, float *,
const int *, int *);
/** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's sgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html
* */
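/* Note (following LAPACK's band-storage conventions): Ab must provide
 * ldAb >= 2*kl + ku + 1 rows; the extra kl rows hold the additional
 * superdiagonals of U that fill in due to row interchanges. */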
void RELAPACK_sgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SGBTRF", &minfo);
return;
}
// Constant
const float ZERO[] = { 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + kv;
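// With this shift, A[i + *ldA * j] == Ab[kv + i - j + *ldAb * j], i.e. the
// band-storage entry Ab(kv+1+i-j, j) that holds A(i,j) in 1-based Fortran
// indexing.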
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
float *const A_j = A + *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[i] = 0.;
}
// Allocate work space
const int n1 = SREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
float *Worku = malloc(mWorku * nWorku * sizeof(float));
LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** sgbtrf's recursive compute kernel */
static void RELAPACK_sgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
float *Ab, const int *ldAb, int *ipiv,
float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
// Unblocked
LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + kv;
// Splitting
const int n1 = MIN(SREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
float *const Ab_L = Ab;
float *const Ab_BR = Ab + *ldAb * n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + m1;
float *const A_BR = A + *ldA * n1 + m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl A_Rr
float *const A_Rl = A_R;
float *const A_Rr = A_R + *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + m21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + *ldA * n21;
float *const A_BRbl = A_BR + m21;
float *const A_BRbr = A_BR + *ldA * n21 + m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
float *const A_Rrj = A_Rr + *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const float tmp = A_Rrj[i];
A_Rrj[i] = A_Rrj[ip];
A_Rrj[ip] = tmp;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
else
BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}

165
relapack/src/sgemmt.c Normal file
View File

@ -0,0 +1,165 @@
#include "relapack.h"
static void RELAPACK_sgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const float *, const float *, const int *,
const float *, const int *, const float *, float *, const int *);
/** SGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
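/* Rough equivalence (sketch only; variable names are illustrative):
 *     RELAPACK_sgemmt("L", "N", "N", &n, &k, alpha, A, &ldA, B, &ldB, beta, C, &ldC);
 * leaves C(i,j) for i >= j equal to the corresponding entries produced by
 *     BLAS(sgemm)("N", "N", &n, &n, &k, alpha, A, &ldA, B, &ldB, beta, C, &ldC);
 * while the strictly upper triangle of C is left untouched. */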
void RELAPACK_sgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !notransA)
info = 2;
else if (!tranB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("SGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : 'T';
const char cleantransB = notransB ? 'N' : 'T';
// Recursive kernel
RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** sgemmt's recursive compute kernel */
static void RELAPACK_sgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_SGEMMT, 1)) {
// Unblocked
RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const float *const A_T = A;
const float *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const float *const B_L = B;
const float *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
float *const C_TL = C;
float *const C_TR = C + *ldC * n1;
float *const C_BL = C + n1;
float *const C_BR = C + *ldC * n1 + n1;
// recursion(C_TL)
RELAPACK_sgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(sgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(sgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_sgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** sgemmt's unblocked compute kernel */
static void RELAPACK_sgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const float *alpha, const float *A, const int *ldA,
const float *B, const int *ldB,
const float *beta, float *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const float *const A_0 = A;
const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
float *const C_0i = C + *ldC * i;
float *const C_ii = C + *ldC * i + i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}

117
relapack/src/sgetrf.c Normal file
View File

@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *,
int *, int *);
/** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's sgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html
* */
void RELAPACK_sgetrf(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const float ONE[] = { 1. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const float *const A_L = A;
float *const A_R = A + *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** sgetrf's recursive compute kernel */
static void RELAPACK_sgetrf_rec(
const int *m, const int *n,
float *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_SGETRF, 1)) {
// Unblocked
LAPACK(sgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + *ldA * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}

87
relapack/src/slauum.c Normal file
View File

@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_slauum_rec(const char *, const int *, float *,
const int *, int *);
/** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's slauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html
* */
void RELAPACK_slauum(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info);
}
/** slauum's recursive compute kernel */
static void RELAPACK_slauum_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_SLAUUM, 1)) {
// Unblocked
LAPACK(slauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1. };
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_slauum_rec(uplo, &n1, A_TL, ldA, info);
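// Lower case: L'*L has blocks L_TL'*L_TL + L_BL'*L_BL (top left),
// L_BR'*L_BL (bottom left) and L_BR'*L_BR (bottom right); the recursion above
// forms L_TL'*L_TL, the ssyrk adds L_BL'*L_BL, the strmm overwrites A_BL with
// L_BR'*L_BL, and the second recursion handles L_BR'*L_BR. The upper case is
// the analogue for U*U'.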
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(ssyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(ssyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_slauum_rec(uplo, &n2, A_BR, ldA, info);
}

157
relapack/src/spbtrf.c Normal file
View File

@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_spbtrf_rec(const char *, const int *, const int *,
float *, const int *, float *, const int *, int *);
/** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's spbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html
* */
void RELAPACK_spbtrf(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const float ZERO[] = { 0. };
// Allocate work space
const int n1 = SREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
float *Work = malloc(mWork * nWork * sizeof(float));
LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** spbtrf's recursive compute kernel */
static void RELAPACK_spbtrf_rec(
const char *uplo, const int *n, const int *kd,
float *Ab, const int *ldAb,
float *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_SPBTRF, 1)) {
// Unblocked
LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
float *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(SREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
float *const Ab_BR = Ab + *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, *kd);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
float *const A_TRl = A_TR;
float *const A_TRr = A_TR + *ldA * n21;
float *const A_BLt = A_BL;
float *const A_BLb = A_BL + n21;
float *const A_BRtl = A_BR;
float *const A_BRtr = A_BR + *ldA * n21;
float *const A_BRbl = A_BR + n21;
float *const A_BRbr = A_BR + *ldA * n21 + n21;
if (*uplo == 'L') {
// A_BLt = A_BLt / A_TL'
BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/spotrf.c Normal file
View File

@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_spotrf_rec(const char *, const int *, float *,
const int *, int *);
/** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's spotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html
* */
void RELAPACK_spotrf(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** spotrf's recursive compute kernel */
static void RELAPACK_spotrf_rec(
const char *uplo, const int *n,
float *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_SPOTRF, 1)) {
// Unblocked
LAPACK(spotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
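// Writing A = L*L' with L = [L_TL 0; L_BL L_BR] (lower case), the recursion
// above yields L_TL; then L_BL = A_BL / L_TL' (strsm) and the Schur complement
// A_BR - L_BL*L_BL' (ssyrk) is factored by the second recursion. The upper
// case is the transposed analogue with A = U'*U.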
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

212
relapack/src/ssygst.c Normal file
View File

@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_ssygst_rec(const int *, const char *, const int *,
float *, const int *, const float *, const int *,
float *, const int *, int *);
/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's ssygst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html
* */
void RELAPACK_ssygst(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SSYGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
float *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = SREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * sizeof(float));
if (!Work)
lWork = 0;
#endif
// Recursive kernel
RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** ssygst's recursive compute kernel */
static void RELAPACK_ssygst_rec(
const int *itype, const char *uplo, const int *n,
float *A, const int *ldA, const float *B, const int *ldB,
float *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
// Unblocked
LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const float ZERO[] = { 0. };
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const float HALF[] = { .5 };
const float MHALF[] = { -.5 };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// B_TL B_TR
// B_BL B_BR
const float *const B_TL = B;
const float *const B_TR = B + *ldB * n1;
const float *const B_BL = B + n1;
const float *const B_BR = B + *ldB * n1 + n1;
// recursion(A_TL, B_TL)
RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
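// The ssymm/saxpy pairs below follow LAPACK's blocked ssygst: the +-1/2
// correction involving the off-diagonal block of B is applied once before and
// once after the ssyr2k, so the rank-2k update sees the half-corrected block;
// when enough workspace is available, the product is formed once into Work (T)
// and simply added twice via saxpy.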
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork > n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR A_BR
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork > n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}

238
relapack/src/ssytrf.c Normal file
View File

@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** SSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's ssytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html
* */
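/* Workspace convention (sketch only; variable names are illustrative):
 * calling with *lWork == -1 performs a size query that stores the required
 * length n*(n/2) in Work[0], e.g.
 *     int lWork = -1, info; float query;
 *     RELAPACK_ssytrf("L", &n, A, &ldA, ipiv, &query, &lWork, &info);
 *     lWork = (int) query;
 *     float *Work = malloc(lWork * sizeof(float));
 *     RELAPACK_ssytrf("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
 */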
void RELAPACK_ssytrf(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** ssytrf's recursive compute kernel */
static void RELAPACK_ssytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_SSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = SREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
float *const A_BL_B = A + *n;
float *const A_BR_B = A + *ldA * n1 + *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + n1;
float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
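// As in LAPACK's slasyf, the kernel may factor one column less than requested
// (n2_out == n2 - 1) when its final pivot would be a 2x2 block crossing the
// block boundary; the sgemv below rolls back the earlier trailing update for
// that deferred column.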
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = SREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + *ldA * n_rest;
float *const A_TR_T = A + *ldA * (n_rest + n1);
float *const A_TL = A + *ldA * n_rest + n_rest;
float *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

351
relapack/src/ssytrf_rec2.c Normal file
View File

@ -0,0 +1,351 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static int c__1 = 1;
static float c_b8 = -1.f;
static float c_b9 = 1.f;
/** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's slasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int *
nb, int *kb, float *a, int *lda, int *ipiv, float *w,
int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
float r__1, r__2, r__3;
/* Builtin functions */
double sqrt(double);
/* Local variables */
static int j, k;
static float t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int sscal_(int *, float *, float *, int *),
sgemv_(char *, int *, int *, float *, float *, int *,
float *, int *, float *, float *, int *, ftnlen);
static int kstep;
extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
int *), sswap_(int *, float *, int *, float *, int *
);
static float absakk;
extern int isamax_(int *, float *, int *);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1));
if (imax > 1) {
i__1 = imax - 1;
jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1],
dabs(r__1));
rowmax = dmax(r__2,r__3);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >=
alpha * rowmax) {
kp = imax;
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
i__1 = kk - 1 - kp;
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
r1 = 1.f / a[k + k * a_dim1];
i__1 = k - 1;
sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
d21 = w[k - 1 + kw * w_dim1];
d11 = w[k + kw * w_dim1] / d21;
d22 = w[k - 1 + (kw - 1) * w_dim1] / d21;
t = 1.f / (d11 * d22 - 1.f);
d21 = t / d21;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1)
* w_dim1] - w[j + kw * w_dim1]);
a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] -
w[j + (kw - 1) * w_dim1]);
/* L20: */
}
}
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
a[k + k * a_dim1] = w[k + kw * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
absakk = (r__1 = w[k + k * w_dim1], dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1],
dabs(r__1));
rowmax = dmax(r__2,r__3);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >=
alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
kk = k + kstep - 1;
if (kp != kk) {
a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
i__1 = kp - kk - 1;
scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
r1 = 1.f / a[k + k * a_dim1];
i__1 = *n - k;
sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
d21 = w[k + 1 + k * w_dim1];
d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
d22 = w[k + k * w_dim1] / d21;
t = 1.f / (d11 * d22 - 1.f);
d21 = t / d21;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] -
w[j + (k + 1) * w_dim1]);
a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) *
w_dim1] - w[j + k * w_dim1]);
/* L80: */
}
}
a[k + k * a_dim1] = w[k + k * w_dim1];
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/ssytrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *,
float *, const int *, int *, float *, const int *, int *);
/** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's ssytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html
* */
void RELAPACK_ssytrf_rook(
const char *uplo, const int *n,
float *A, const int *ldA, int *ipiv,
float *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * sizeof(float));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("SSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** ssytrf_rook's recursive compute kernel */
static void RELAPACK_ssytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
float *A, const int *ldA, int *ipiv,
float *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = SREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
float *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
float *const A_BL_B = A + *n;
float *const A_BR_B = A + *ldA * n1 + *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
float *const Work_BL = Work + n1;
float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
// last row of A_BL
float *const A_BL_b = A_BL + n2_out;
// last row of Work_BL
float *const Work_BL_b = Work_BL + n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = SREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
float *const Work_R = top ? Work : Work + *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
float *const A_TL_T = A + *ldA * n_rest;
float *const A_TR_T = A + *ldA * (n_rest + n1);
float *const A_TL = A + *ldA * n_rest + n_rest;
float *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
float *const Work_L = Work;
float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

451
relapack/src/ssytrf_rook_rec2.c Normal file
View File

@ -0,0 +1,451 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static int c__1 = 1;
static float c_b9 = -1.f;
static float c_b10 = 1.f;
/** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's slasyf_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, float *a, int *lda, int *ipiv, float *
w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
float r__1;
/* Builtin functions */
double sqrt(double);
/* Local variables */
static int j, k, p;
static float t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static float alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int sscal_(int *, float *, float *, int *);
static float sfmin;
static int itemp;
extern /* Subroutine */ int sgemv_(char *, int *, int *, float *,
float *, int *, float *, int *, float *, float *, int *,
ftnlen);
static int kstep;
static float stemp;
extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
int *), sswap_(int *, float *, int *, float *, int *
);
static float absakk;
extern double slamch_(char *, ftnlen);
extern int isamax_(int *, float *, int *);
static float colmax, rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.f) + 1.f) / 8.f;
sfmin = slamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1));
if (k > 1) {
i__1 = k - 1;
imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1));
} else {
rowmax = 0.f;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) <
alpha * rowmax)) {
kp = imax;
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
a[kp + k * a_dim1] = a[kk + k * a_dim1];
i__1 = k - 1 - kp;
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) {
r1 = 1.f / a[k + k * a_dim1];
i__1 = k - 1;
sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else if (a[k + k * a_dim1] != 0.f) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
a[ii + k * a_dim1] /= a[k + k * a_dim1];
/* L14: */
}
}
}
} else {
if (k > 2) {
d12 = w[k - 1 + kw * w_dim1];
d11 = w[k + kw * w_dim1] / d12;
d22 = w[k - 1 + (kw - 1) * w_dim1] / d12;
t = 1.f / (d11 * d22 - 1.f);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) *
w_dim1] - w[j + kw * w_dim1]) / d12);
a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] -
w[j + (kw - 1) * w_dim1]) / d12);
/* L20: */
}
}
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
a[k + k * a_dim1] = w[k + kw * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
absakk = (r__1 = w[k + k * w_dim1], dabs(r__1));
if (k < *n) {
i__1 = *n - k;
imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1));
} else {
colmax = 0.f;
}
if (dmax(absakk,colmax) == 0.f) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k
+ 1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1));
} else {
rowmax = 0.f;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1));
if (stemp > rowmax) {
rowmax = stemp;
jmax = itemp;
}
}
if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) <
alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
a[kp + k * a_dim1] = a[kk + k * a_dim1];
i__1 = kp - k - 1;
scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) {
r1 = 1.f / a[k + k * a_dim1];
i__1 = *n - k;
sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else if (a[k + k * a_dim1] != 0.f) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
a[ii + k * a_dim1] /= a[k + k * a_dim1];
/* L74: */
}
}
}
} else {
if (k < *n - 1) {
d21 = w[k + 1 + k * w_dim1];
d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
d22 = w[k + k * w_dim1] / d21;
t = 1.f / (d11 * d22 - 1.f);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
j + (k + 1) * w_dim1]) / d21);
a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
w_dim1] - w[j + k * w_dim1]) / d21);
/* L80: */
}
}
a[k + k * a_dim1] = w[k + k * w_dim1];
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}
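In both loops above, the two columns of a 2x2 pivot are obtained by multiplying rows of W by the inverse of the 2x2 pivot block. A small sketch of that algebra, not part of the original sources, written in the lower-triangular variant's notation (d21 = w(k+1,k); the helper name apply_inv_2x2 is hypothetical):
/* For the symmetric pivot block
 *     D = [ w(k,k)   d21        ]
 *         [ d21      w(k+1,k+1) ]
 * set d11 = w(k+1,k+1)/d21, d22 = w(k,k)/d21 and t = 1/(d11*d22 - 1).
 * Then det(D) = d21*d21*(d11*d22 - 1) and inv(D) = (t/d21) * [ d11  -1 ; -1  d22 ],
 * so multiplying a row [ wjk  wjk1 ] of W by inv(D) gives exactly the updates above. */
static void apply_inv_2x2(float d21, float d11, float d22, float t,
                          float wjk, float wjk1, float *ajk, float *ajk1) {
    *ajk  = t * ((d11 * wjk  - wjk1) / d21);    /* new entry in column k     */
    *ajk1 = t * ((d22 * wjk1 - wjk ) / d21);    /* new entry in column k + 1 */
}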

274
relapack/src/stgsyl.c Normal file
View File

@ -0,0 +1,274 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_stgsyl_rec(const char *, const int *, const int *,
const int *, const float *, const int *, const float *, const int *,
float *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, float *, float *, int *, int *,
int *);
/** STGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's stgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html
* */
void RELAPACK_stgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dif,
float *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "T");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("STGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'T';
// Constant
const float ZERO[] = { 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
float scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
float dscale = 0;
float dsum = 1;
int pq;
RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(pq) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(slacpy)("F", m, n, C, ldC, Work, m);
LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m);
LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(slacpy)("F", m, n, Work, m, C, ldC);
LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** stgsyl's recursive compute kernel */
static void RELAPACK_stgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC,
const float *D, const int *ldD, const float *E, const int *ldE,
float *F, const int *ldF,
float *scale, float *dsum, float *dscale,
int *iWork, int *pq, int *info
) {
if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) {
// Unblocked
LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1. };
float scale2[] = { 1. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
int m1 = SREC_SPLIT(*m);
if (A[m1 + *ldA * (m1 - 1)])
m1++;
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + *ldA * m1;
const float *const A_BR = A + *ldA * m1 + m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + m1;
// D_TL D_TR
// 0 D_BR
const float *const D_TL = D;
const float *const D_TR = D + *ldD * m1;
const float *const D_BR = D + *ldD * m1 + m1;
// F_T
// F_B
float *const F_T = F;
float *const F_B = F + m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
// C_T = C_T - A_TR * C_B
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldF, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
int n1 = SREC_SPLIT(*n);
if (B[n1 + *ldB * (n1 - 1)])
n1++;
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + *ldB * n1;
const float *const B_BR = B + *ldB * n1 + n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + *ldC * n1;
// E_TL E_TR
// 0 E_BR
const float *const E_TL = E;
const float *const E_TR = E + *ldE * n1;
const float *const E_BR = E + *ldE * n1 + n1;
// F_L F_R
float *const F_L = F;
float *const F_R = F + *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
// C_R = C_R + F_L * B_TR
BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR
BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR
BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
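The lWork == -1 branch above implements LAPACK's usual workspace-query protocol. The following is a minimal calling sketch, not part of the original sources: the prototype is copied from this file, the main program is purely illustrative, and the iWork size of m + n + 6 integers follows LAPACK's stgsyl documentation.
#include <stdio.h>
#include <stdlib.h>
void RELAPACK_stgsyl(const char *, const int *, const int *, const int *,
    const float *, const int *, const float *, const int *,
    float *, const int *,
    const float *, const int *, const float *, const int *,
    float *, const int *,
    float *, float *,
    float *, const int *, int *, int *);
int main(void) {
    const int m = 1, n = 1, ld = 1, ijob = 1;
    // 1x1 pencils (A, D) and (B, E) with distinct generalized eigenvalues 2 and 1
    const float A[] = { 2.f }, D[] = { 1.f }, B[] = { 1.f }, E[] = { 1.f };
    float C[] = { 3.f }, F[] = { 1.f };    // right-hand sides, overwritten by R and L
    float scale, dif, query;
    int info, lwork = -1;
    int *iwork = malloc((m + n + 6) * sizeof(int));
    // Workspace query: with lWork == -1 only the required size is returned in Work[0]
    RELAPACK_stgsyl("N", &ijob, &m, &n, A, &ld, B, &ld, C, &ld,
                    D, &ld, E, &ld, F, &ld, &scale, &dif,
                    &query, &lwork, iwork, &info);
    lwork = (int)query;
    float *work = malloc(lwork * sizeof(float));
    // Solve A*R - L*B = scale*C, D*R - L*E = scale*F; expect R = 2, L = 1, scale = 1
    RELAPACK_stgsyl("N", &ijob, &m, &n, A, &ld, B, &ld, C, &ld,
                    D, &ld, E, &ld, F, &ld, &scale, &dif,
                    work, &lwork, iwork, &info);
    printf("info = %d, scale = %g, dif = %g, R = %g, L = %g\n",
           info, scale, dif, C[0], F[0]);
    free(work);
    free(iwork);
    return 0;
}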

169
relapack/src/strsyl.c Normal file
View File

@ -0,0 +1,169 @@
#include "relapack.h"
static void RELAPACK_strsyl_rec(const char *, const char *, const int *,
const int *, const int *, const float *, const int *, const float *,
const int *, float *, const int *, float *, int *);
/** STRSYL solves the real Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's strsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html
* */
void RELAPACK_strsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int transA = LAPACK(lsame)(tranA, "T");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int transB = LAPACK(lsame)(tranB, "T");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!transA && !ctransA && !notransA)
*info = -1;
else if (!transB && !ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("STRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
// Recursive kernel
RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** strsyl's recursive compute kernel */
static void RELAPACK_strsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const float *A, const int *ldA, const float *B, const int *ldB,
float *C, const int *ldC, float *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) {
// Unblocked
RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
const float MSGN[] = { -*isgn };
const int iONE[] = { 1 };
// Outputs
float scale1[] = { 1. };
float scale2[] = { 1. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
int m1 = SREC_SPLIT(*m);
if (A[m1 + *ldA * (m1 - 1)])
m1++;
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const float *const A_TL = A;
const float *const A_TR = A + *ldA * m1;
const float *const A_BR = A + *ldA * m1 + m1;
// C_T
// C_B
float *const C_T = C;
float *const C_B = C + m1;
if (*tranA == 'N') {
// recursion(A_BR, B, C_B)
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recursion(A_TL, B, C_T)
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recursion(A_TL, B, C_T)
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recursion(A_BR, B, C_B)
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
}
} else {
// Splitting
int n1 = SREC_SPLIT(*n);
if (B[n1 + *ldB * (n1 - 1)])
n1++;
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const float *const B_TL = B;
const float *const B_TR = B + *ldB * n1;
const float *const B_BR = B + *ldB * n1 + n1;
// C_L C_R
float *const C_L = C;
float *const C_R = C + *ldC * n1;
if (*tranB == 'N') {
// recursion(A, B_TL, C_L)
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recursion(A, B_BR, C_R)
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recursion(A, B_BR, C_R)
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recursion(A, B_TL, C_L)
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}
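A minimal usage sketch for RELAPACK_strsyl, not part of the original sources; the prototype is copied from this file and the 1x1 data keeps the expected result easy to verify by hand.
#include <stdio.h>
void RELAPACK_strsyl(const char *, const char *, const int *,
    const int *, const int *,
    const float *, const int *, const float *, const int *,
    float *, const int *, float *,
    int *);
int main(void) {
    const int m = 1, n = 1, ld = 1, isgn = 1;
    const float A[] = { 2.f }, B[] = { 3.f };   // trivially in Schur form
    float C[] = { 10.f };                       // right-hand side, overwritten with X
    float scale;
    int info;
    // Solve A*X + X*B = scale*C; expect scale = 1 and X = 10 / (2 + 3) = 2
    RELAPACK_strsyl("N", "N", &isgn, &m, &n, A, &ld, B, &ld, C, &ld, &scale, &info);
    printf("info = %d, scale = %g, X = %g\n", info, scale, C[0]);
    return 0;
}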

1029
relapack/src/strsyl_rec2.c Normal file

File diff suppressed because it is too large Load Diff

107
relapack/src/strtri.c Normal file
View File

@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_strtri_rec(const char *, const char *, const int *,
float *, const int *, int *);
/** STRTRI computes the inverse of a real upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's strtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html
* */
void RELAPACK_strtri(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("STRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[i + *ldA * i] == 0) {
*info = i + 1;
return;
}
}
// Recursive kernel
RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** strtri's recursive compute kernel */
static void RELAPACK_strtri_rec(
const char *uplo, const char *diag, const int *n,
float *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_STRTRI, 1)) {
// Unblocked
LAPACK(strti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const float ONE[] = { 1. };
const float MONE[] = { -1. };
// Splitting
const int n1 = SREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
float *const A_TL = A;
float *const A_TR = A + *ldA * n1;
float *const A_BL = A + n1;
float *const A_BR = A + *ldA * n1 + n1;
// recursion(A_TL)
RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}
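A minimal usage sketch for RELAPACK_strtri, not part of the original sources (prototype copied from this file):
#include <stdio.h>
void RELAPACK_strtri(const char *, const char *, const int *,
    float *, const int *, int *);
int main(void) {
    const int n = 2, ldA = 2;
    // column-major upper triangular matrix [ 2  1 ; 0  4 ]
    float A[] = { 2.f, 0.f, 1.f, 4.f };
    int info;
    RELAPACK_strtri("U", "N", &n, A, &ldA, &info);
    // expected inverse: [ 0.5  -0.125 ; 0  0.25 ]
    printf("info = %d, inv(A) = [ %g %g ; %g %g ]\n", info, A[0], A[2], A[1], A[3]);
    return 0;
}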

230
relapack/src/zgbtrf.c Normal file
View File

@ -0,0 +1,230 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *,
const int *, double *, const int *, int *, double *, const int *, double *,
const int *, int *);
/** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's zgbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html
* */
void RELAPACK_zgbtrf(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kl < 0)
*info = -3;
else if (*ku < 0)
*info = -4;
else if (*ldAb < 2 * *kl + *ku + 1)
*info = -6;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZGBTRF", &minfo);
return;
}
// Constant
const double ZERO[] = { 0., 0. };
// Result upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + 2 * kv;
// Zero upper diagonal fill-in elements
int i, j;
for (j = 0; j < *n; j++) {
double *const A_j = A + 2 * *ldA * j;
for (i = MAX(0, j - kv); i < j - *ku; i++)
A_j[2 * i] = A_j[2 * i + 1] = 0.;
}
// Allocate work space
const int n1 = ZREC_SPLIT(*n);
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const int nWorkl = (kv > n1) ? n1 : kv;
const int mWorku = (*kl > n1) ? n1 : *kl;
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double));
double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double));
LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
// Free work space
free(Workl);
free(Worku);
}
/** zgbtrf's recursive compute kernel */
static void RELAPACK_zgbtrf_rec(
const int *m, const int *n, const int *kl, const int *ku,
double *Ab, const int *ldAb, int *ipiv,
double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
int *info
) {
if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) {
// Unblocked
LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterators
int i, j;
// Output upper band width
const int kv = *ku + *kl;
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + 2 * kv;
// Splitting
const int n1 = MIN(ZREC_SPLIT(*n), *kl);
const int n2 = *n - n1;
const int m1 = MIN(n1, *m);
const int m2 = *m - m1;
const int mn1 = MIN(m1, n1);
const int mn2 = MIN(m2, n2);
// Ab_L *
// Ab_BR
double *const Ab_L = Ab;
double *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * m1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * m1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// Banded splitting
const int n21 = MIN(n2, kv - n1);
const int n22 = MIN(n2 - n21, n1);
const int m21 = MIN(m2, *kl - m1);
const int m22 = MIN(m2 - m21, m1);
// n1 n21 n22
// m * A_Rl ARr
double *const A_Rl = A_R;
double *const A_Rr = A_R + 2 * *ldA * n21;
// n1 n21 n22
// m1 * A_TRl A_TRr
// m21 A_BLt A_BRtl A_BRtr
// m22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + 2 * *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + 2 * m21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + 2 * *ldA * n21;
double *const A_BRbl = A_BR + 2 * m21;
double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
// Workl = A_BLb
LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
// partially redo swaps in A_L
for (i = 0; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// apply pivots to A_Rl
LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
// apply pivots to A_Rr columnwise
for (j = 0; j < n22; j++) {
double *const A_Rrj = A_Rr + 2 * *ldA * j;
for (i = j; i < mn1; i++) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
const double tmpr = A_Rrj[2 * i];
const double tmpc = A_Rrj[2 * i + 1];
A_Rrj[2 * i] = A_Rrj[2 * ip];
A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
A_Rrj[2 * ip] = tmpr;
A_Rrj[2 * ip + 1] = tmpc;
}
}
}
// A_TRl = A_TL \ A_TRl
BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// Worku = A_TRr
LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
// Worku = A_TL \ Worku
BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
// A_TRr = Worku
LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
// A_BRtl = A_BRtl - A_BLt * A_TRl
BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
// A_BRbl = A_BRbl - Workl * A_TRl
BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
// A_BRtr = A_BRtr - A_BLt * Worku
BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Workl * Worku
BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
// partially undo swaps in A_L
for (i = mn1 - 1; i >= 0; i--) {
const int ip = ipiv_T[i] - 1;
if (ip != i) {
if (ip < *kl)
BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
else
BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
if (*info)
*info += n1;
// shift pivots
for (i = 0; i < mn2; i++)
ipiv_B[i] += n1;
}
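The "unskew" step above reinterprets the band storage Ab with leading dimension ldAb - 1 so that the full-matrix entry (i, j) can be addressed directly. A small index-arithmetic check, not part of the original sources (the factor 2 accounts for the interleaved real/imaginary parts):
#include <assert.h>
int main(void) {
    const int m = 7, n = 6, kl = 2, ku = 1;
    const int kv = kl + ku;                 // resulting upper band width
    const int ldAb = 2 * kl + ku + 1;       // minimal leading dimension for zgbtrf
    const int ldA = ldAb - 1;               // leading dimension after unskewing
    int i, j;
    for (j = 0; j < n; j++)
        for (i = (j - ku > 0 ? j - ku : 0); i < m && i <= j + kl; i++) {
            // LAPACK band storage (0-based): entry (i, j) lives in row kv + i - j of column j
            const int band_offset   = 2 * ((kv + i - j) + j * ldAb);
            // the same entry addressed through A = Ab + 2*kv with leading dimension ldA
            const int unskew_offset = 2 * kv + 2 * (i + j * ldA);
            assert(band_offset == unskew_offset);
        }
    return 0;
}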

167
relapack/src/zgemmt.c Normal file
View File

@ -0,0 +1,167 @@
#include "relapack.h"
static void RELAPACK_zgemmt_rec(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *,
const int *, const int *, const double *, const double *, const int *,
const double *, const int *, const double *, double *, const int *);
/** ZGEMMT computes a matrix-matrix product with general matrices but updates
* only the upper or lower triangular part of the result matrix.
*
* This routine performs the same operation as the BLAS routine
* zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
* but only updates the triangular part of C specified by uplo:
* If (*uplo == 'L'), only the lower triangular part of C is updated,
* otherwise the upper triangular part is updated.
* */
void RELAPACK_zgemmt(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
#if HAVE_XGEMMT
BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
#else
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int notransA = LAPACK(lsame)(transA, "N");
const int tranA = LAPACK(lsame)(transA, "T");
const int ctransA = LAPACK(lsame)(transA, "C");
const int notransB = LAPACK(lsame)(transB, "N");
const int tranB = LAPACK(lsame)(transB, "T");
const int ctransB = LAPACK(lsame)(transB, "C");
int info = 0;
if (!lower && !upper)
info = 1;
else if (!tranA && !ctransA && !notransA)
info = 2;
else if (!tranB && !ctransB && !notransB)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*ldA < MAX(1, notransA ? *n : *k))
info = 8;
else if (*ldB < MAX(1, notransB ? *k : *n))
info = 10;
else if (*ldC < MAX(1, *n))
info = 13;
if (info) {
LAPACK(xerbla)("ZGEMMT", &info);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');
// Recursive kernel
RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
#endif
}
/** zgemmt's recursive compute kernel */
static void RELAPACK_zgemmt_rec(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) {
// Unblocked
RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
return;
}
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_T
// A_B
const double *const A_T = A;
const double *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);
// B_L B_R
const double *const B_L = B;
const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);
// C_TL C_TR
// C_BL C_BR
double *const C_TL = C;
double *const C_TR = C + 2 * *ldC * n1;
double *const C_BL = C + 2 * n1;
double *const C_BR = C + 2 * *ldC * n1 + 2 * n1;
// recursion(C_TL)
RELAPACK_zgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
if (*uplo == 'L')
// C_BL = alpha A_B B_L + beta C_BL
BLAS(zgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
else
// C_TR = alpha A_T B_R + beta C_TR
BLAS(zgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
// recursion(C_BR)
RELAPACK_zgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
}
/** zgemmt's unblocked compute kernel */
static void RELAPACK_zgemmt_rec2(
const char *uplo, const char *transA, const char *transB,
const int *n, const int *k,
const double *alpha, const double *A, const int *ldA,
const double *B, const int *ldB,
const double *beta, double *C, const int *ldC
) {
const int incB = (*transB == 'N') ? 1 : *ldB;
const int incC = 1;
int i;
for (i = 0; i < *n; i++) {
// A_0
// A_i
const double *const A_0 = A;
const double *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
// * B_i *
const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
// * C_0i *
// * C_ii *
double *const C_0i = C + 2 * *ldC * i;
double *const C_ii = C + 2 * *ldC * i + 2 * i;
if (*uplo == 'L') {
const int nmi = *n - i;
if (*transA == 'N')
BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
else
BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
} else {
const int ip1 = i + 1;
if (*transA == 'N')
BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
else
BLAS(zgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
}
}
}
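For reference, the operation computed here can be written as a plain triple loop. The sketch below (gemmt_ref is a hypothetical name) is not part of the original sources and uses real doubles, column-major storage and no transposes purely to show which entries of C are touched; the actual routine works on interleaved complex data.
#include <stddef.h>
// C := alpha*A*B + beta*C, updating only the lower ('L') or upper triangle of the n-by-n result
static void gemmt_ref(char uplo, int n, int k, double alpha,
                      const double *A, int ldA, const double *B, int ldB,
                      double beta, double *C, int ldC) {
    int i, j, l;
    for (j = 0; j < n; j++) {
        const int ibeg = (uplo == 'L') ? j : 0;        // first row touched in column j
        const int iend = (uplo == 'L') ? n : j + 1;    // one past the last row touched
        for (i = ibeg; i < iend; i++) {
            double s = 0.;
            for (l = 0; l < k; l++)
                s += A[i + (size_t)ldA * l] * B[l + (size_t)ldB * j];
            C[i + (size_t)ldC * j] = alpha * s + beta * C[i + (size_t)ldC * j];
        }
    }
}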

117
relapack/src/zgetrf.c Normal file
View File

@ -0,0 +1,117 @@
#include "relapack.h"
static void RELAPACK_zgetrf_rec(const int *, const int *, double *,
const int *, int *, int *);
/** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
*
* This routine is functionally equivalent to LAPACK's zgetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html
* */
void RELAPACK_zgetrf(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZGETRF", &minfo);
return;
}
const int sn = MIN(*m, *n);
RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
if (*m < *n) {
// Constants
const double ONE[] = { 1., 0. };
const int iONE[] = { 1 };
// Splitting
const int rn = *n - *m;
// A_L A_R
const double *const A_L = A;
double *const A_R = A + 2 * *ldA * *m;
// A_R = apply(ipiv, A_R)
LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
// A_R = A_L \ A_R
BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
}
}
/** zgetrf's recursive compute kernel */
static void RELAPACK_zgetrf_rec(
const int *m, const int *n,
double *A, const int *ldA, int *ipiv,
int *info
) {
if (*n <= MAX(CROSSOVER_ZGETRF, 1)) {
// Unblocked
LAPACK(zgetf2)(m, n, A, ldA, ipiv, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
const int m2 = *m - n1;
// A_L A_R
double *const A_L = A;
double *const A_R = A + 2 * *ldA * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// ipiv_T
// ipiv_B
int *const ipiv_T = ipiv;
int *const ipiv_B = ipiv + n1;
// recursion(A_L, ipiv_T)
RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
// apply pivots to A_R
LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
// A_TR = A_TL \ A_TR
BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_BL * A_TR
BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
// recursion(A_BR, ipiv_B)
RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
if (*info)
*info += n1;
// apply pivots to A_BL
LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
// shift pivots
int i;
for (i = 0; i < n2; i++)
ipiv_B[i] += n1;
}
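A minimal usage sketch for RELAPACK_zgetrf, not part of the original sources; complex numbers are passed as interleaved double pairs, as everywhere in these files, and the prototype is copied from above.
#include <stdio.h>
void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *);
int main(void) {
    const int n = 2, ldA = 2;
    // column-major complex 2x2 matrix [ 1  2 ; 3  4 ] (all imaginary parts zero)
    double A[] = { 1., 0.,  3., 0.,   2., 0.,  4., 0. };
    int ipiv[2], info;
    RELAPACK_zgetrf(&n, &n, A, &ldA, ipiv, &info);
    // partial pivoting picks row 2 first: expect ipiv = { 2, 2 },
    // U = [ 3  4 ; 0  2/3 ] and the multiplier L(2,1) = 1/3
    printf("info = %d, ipiv = { %d, %d }\n", info, ipiv[0], ipiv[1]);
    return 0;
}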

212
relapack/src/zhegst.c Normal file
View File

@ -0,0 +1,212 @@
#include "relapack.h"
#if XSYGST_ALLOW_MALLOC
#include "stdlib.h"
#endif
static void RELAPACK_zhegst_rec(const int *, const char *, const int *,
double *, const int *, const double *, const int *,
double *, const int *, int *);
/** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
*
* This routine is functionally equivalent to LAPACK's zhegst.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html
* */
void RELAPACK_zhegst(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (*itype < 1 || *itype > 3)
*info = -1;
else if (!lower && !upper)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
else if (*ldB < MAX(1, *n))
*info = -7;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZHEGST", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Allocate work space
double *Work = NULL;
int lWork = 0;
#if XSYGST_ALLOW_MALLOC
const int n1 = ZREC_SPLIT(*n);
lWork = n1 * (*n - n1);
Work = malloc(lWork * 2 * sizeof(double));
if (!Work)
lWork = 0;
#endif
// recursive kernel
RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
// Free work space
#if XSYGST_ALLOW_MALLOC
if (Work)
free(Work);
#endif
}
/** zhegst's recursive compute kernel */
static void RELAPACK_zhegst_rec(
const int *itype, const char *uplo, const int *n,
double *A, const int *ldA, const double *B, const int *ldB,
double *Work, const int *lWork, int *info
) {
if (*n <= MAX(CROSSOVER_ZHEGST, 1)) {
// Unblocked
LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info);
return;
}
// Constants
const double ZERO[] = { 0., 0. };
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const double HALF[] = { .5, 0. };
const double MHALF[] = { -.5, 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// B_TL B_TR
// B_BL B_BR
const double *const B_TL = B;
const double *const B_TR = B + 2 * *ldB * n1;
const double *const B_BL = B + 2 * n1;
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// recursion(A_TL, B_TL)
RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
if (*itype == 1)
if (*uplo == 'L') {
// A_BL = A_BL / B_TL'
BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork >= n2 * n1) {
// T = -1/2 * B_BL * A_TL
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
if (*lWork >= n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL - 1/2 B_BL * A_TL
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR \ A_BL
BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL' \ A_TR
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork >= n2 * n1) {
// T = -1/2 * A_TL * B_TR
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
if (*lWork >= n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR - 1/2 A_TL * B_TR
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR / B_BR
BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
else
if (*uplo == 'L') {
// A_BL = A_BL * B_TL
BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
if (*lWork >= n2 * n1) {
// T = 1/2 * A_BR * B_BL
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
} else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
if (*lWork >= n2 * n1)
// A_BL = A_BL + T
for (i = 0; i < n1; i++)
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
else
// A_BL = A_BL + 1/2 A_BR * B_BL
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
// A_BL = B_BR * A_BL
BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
} else {
// A_TR = B_TL * A_TR
BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
if (*lWork >= n2 * n1) {
// T = 1/2 * B_TR * A_BR
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
} else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
if (*lWork >= n2 * n1)
// A_TR = A_TR + T
for (i = 0; i < n2; i++)
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
else
// A_TR = A_TR + 1/2 B_TR * A_BR
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
// A_TR = A_TR * B_BR
BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
}
// recursion(A_BR, B_BR)
RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}
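A minimal usage sketch for RELAPACK_zhegst, not part of the original sources. As in LAPACK's zhegst, the B argument must already contain the Cholesky factor of the positive definite matrix (as zpotrf would return it); here that factor is written down directly for a diagonal B.
#include <stdio.h>
void RELAPACK_zhegst(const int *, const char *, const int *,
    double *, const int *, const double *, const int *, int *);
int main(void) {
    const int n = 2, ld = 2, itype = 1;
    // column-major, interleaved complex: Hermitian A = [ 2  1 ; 1  3 ]
    double A[] = { 2., 0.,  1., 0.,   1., 0.,  3., 0. };
    // Cholesky factor of B = diag(4, 9): L = diag(2, 3)
    const double L[] = { 2., 0.,  0., 0.,   0., 0.,  3., 0. };
    int info;
    // itype = 1, uplo = 'L': A is overwritten with inv(L) * A * inv(L^H)
    RELAPACK_zhegst(&itype, "L", &n, A, &ld, L, &ld, &info);
    // expect A11 = 0.5, A21 = 1/6, A22 = 1/3
    printf("info = %d, A11 = %g, A21 = %g, A22 = %g\n", info, A[0], A[2], A[6]);
    return 0;
}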

236
relapack/src/zhetrf.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's zhetrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html
* */
void RELAPACK_zhetrf(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** zhetrf's recursive compute kernel */
static void RELAPACK_zhetrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_ZHETRF, 3)) {
// Unblocked
if (top) {
LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = ZREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
double *const A_BL_B = A + 2 * *n;
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + 2 * n1;
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = ZREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + 2 * *ldA * n_rest;
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}
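As in LAPACK, passing lWork == -1 performs a workspace query. A calling sketch, not part of the original sources (prototype copied from this file; the identity matrix merely keeps the example well defined):
#include <stdio.h>
#include <stdlib.h>
void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *,
    double *, const int *, int *);
int main(void) {
    const int n = 100, ldA = 100;
    double *A = calloc((size_t)2 * ldA * n, sizeof(double));
    int *ipiv = malloc(n * sizeof(int));
    double query[2];
    int info, lwork = -1, i;
    for (i = 0; i < n; i++)
        A[2 * (i + i * ldA)] = 1.;              // Hermitian test matrix: the identity
    // Workspace query: the required size in complex elements is returned in Work[0]
    RELAPACK_zhetrf("L", &n, A, &ldA, ipiv, query, &lwork, &info);
    lwork = (int)query[0];                      // n * (n / 2) complex elements
    double *work = malloc((size_t)2 * lwork * sizeof(double));
    // L * D * L^H factorization with Bunch-Kaufman pivoting
    RELAPACK_zhetrf("L", &n, A, &ldA, ipiv, work, &lwork, &info);
    printf("info = %d (0 on success, > 0 flags a singular pivot)\n", info);
    free(A); free(ipiv); free(work);
    return 0;
}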

524
relapack/src/zhetrf_rec2.c Normal file
View File

@ -0,0 +1,524 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static doublecomplex c_b1 = {1.,0.};
static int c__1 = 1;
/** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
*
* This routine is a minor modification of LAPACK's zlahef.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int *
nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
double d__1, d__2, d__3, d__4;
doublecomplex z__1, z__2, z__3, z__4;
/* Builtin functions */
double sqrt(double), d_imag(doublecomplex *);
void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k;
static double t, r1;
static doublecomplex d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static double alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
static int kstep;
extern /* Subroutine */ int zgemv_(char *, int *, int *,
doublecomplex *, doublecomplex *, int *, doublecomplex *,
int *, doublecomplex *, doublecomplex *, int *, ftnlen),
zcopy_(int *, doublecomplex *, int *, doublecomplex *,
int *), zswap_(int *, doublecomplex *, int *,
doublecomplex *, int *);
static double absakk;
extern /* Subroutine */ int zdscal_(int *, double *,
doublecomplex *, int *);
static double colmax;
extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
;
extern int izamax_(int *, doublecomplex *, int *);
static double rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
i__1 = k - 1;
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
i__1 = k + kw * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1));
if (k > 1) {
i__1 = k - 1;
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
kw * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - 1;
zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
i__1 = k - imax;
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
i__1 = k - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
jmax + (kw - 1) * w_dim1]), abs(d__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs(
d__2));
rowmax = max(d__3,d__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) {
kp = imax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = kk - 1 - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
r1 = 1. / a[i__1].r;
i__1 = k - 1;
zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
i__1 = k - 1;
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
d_cnjg(&z__2, &d21);
z_div(&z__1, &w[k + kw * w_dim1], &z__2);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1. / (z__1.r - 1.);
z__2.r = t, z__2.i = 0.;
z_div(&z__1, &z__2, &d21);
d21.r = z__1.r, d21.i = z__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + k * a_dim1;
d_cnjg(&z__2, &d21);
i__3 = j + kw * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
z__2.r * z__3.i + z__2.i * z__3.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
i__1 = k + k * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1));
if (k < *n) {
i__1 = *n - k;
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
k * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (imax < *n) {
i__1 = *n - imax;
zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
i__1 = imax - k;
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
jmax + (k + 1) * w_dim1]), abs(d__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs(
d__2));
rowmax = max(d__3,d__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) {
kp = imax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = kp - kk - 1;
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
r1 = 1. / a[i__1].r;
i__1 = *n - k;
zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
i__1 = *n - k;
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
d_cnjg(&z__2, &d21);
z_div(&z__1, &w[k + k * w_dim1], &z__2);
d22.r = z__1.r, d22.i = z__1.i;
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1. / (z__1.r - 1.);
z__2.r = t, z__2.i = 0.;
z_div(&z__1, &z__2, &d21);
d21.r = z__1.r, d21.i = z__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
d_cnjg(&z__2, &d21);
i__3 = j + k * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
z__2.r * z__3.i + z__2.i * z__3.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/zhetrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's zhetrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html
* */
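/* A minimal, hypothetical usage sketch (matrix contents, sizes, and variable
 * names are assumed, not taken from this file): the Hermitian matrix is stored
 * column-major as interleaved real/imaginary doubles, and the workspace size
 * is obtained through the lWork == -1 query handled below.
 *
 *   int n = 100, ldA = n, lWork = -1, info;
 *   int *ipiv = malloc(n * sizeof(int));
 *   double *A = malloc(2 * (size_t)ldA * n * sizeof(double));  // fill with Hermitian data
 *   double query[2];
 *   RELAPACK_zhetrf_rook("L", &n, A, &ldA, ipiv, query, &lWork, &info);  // workspace query
 *   lWork = (int)query[0];                                               // n * (n / 2) complex entries
 *   double *Work = malloc(2 * (size_t)lWork * sizeof(double));
 *   RELAPACK_zhetrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);   // A = L*D*L**H
 */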
void RELAPACK_zhetrf_rook(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZHETRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** zhetrf_rook's recursive compute kernel */
static void RELAPACK_zhetrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = ZREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
double *const A_BL_B = A + 2 * *n;
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + 2 * n1;
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = ZREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + 2 * *ldA * n_rest;
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

662
relapack/src/zhetrf_rook_rec2.c Normal file
View File

@ -0,0 +1,662 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static doublecomplex c_b1 = {1.,0.};
static int c__1 = 1;
/** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's zlahef_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
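/* Parameter sketch, inferred from the recursive caller (RELAPACK_zhetrf_rook_rec)
 * rather than from authoritative documentation: n is the order of the full
 * trailing matrix, nb the width of the panel to factor within it, and kb returns
 * the number of columns actually factored; the deferred Level 3 update of the
 * remaining block is applied by the caller using the panel factors left in w. */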
/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, doublecomplex *a, int *lda, int *
ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
double d__1, d__2;
doublecomplex z__1, z__2, z__3, z__4, z__5;
/* Builtin functions */
double sqrt(double), d_imag(doublecomplex *);
void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k, p;
static double t, r1;
static doublecomplex d11, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static double alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
static double dtemp, sfmin;
static int itemp, kstep;
extern /* Subroutine */ int zgemv_(char *, int *, int *,
doublecomplex *, doublecomplex *, int *, doublecomplex *,
int *, doublecomplex *, doublecomplex *, int *, ftnlen),
zcopy_(int *, doublecomplex *, int *, doublecomplex *,
int *), zswap_(int *, doublecomplex *, int *,
doublecomplex *, int *);
extern double dlamch_(char *, ftnlen);
static double absakk;
extern /* Subroutine */ int zdscal_(int *, double *,
doublecomplex *, int *);
static double colmax;
extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
;
extern int izamax_(int *, doublecomplex *, int *);
static double rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
sfmin = dlamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
if (k > 1) {
i__1 = k - 1;
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
c__1);
}
i__1 = k + kw * w_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
i__1 = k + kw * w_dim1;
i__2 = k + kw * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
i__1 = k + kw * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1));
if (k > 1) {
i__1 = k - 1;
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
kw * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
d__1 = w[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
if (k > 1) {
i__1 = k - 1;
zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
&c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
if (imax > 1) {
i__1 = imax - 1;
zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
}
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + imax * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
i__1 = k - imax;
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
i__1 = k - imax;
zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
i__1 = imax + (kw - 1) * w_dim1;
i__2 = imax + (kw - 1) * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
w[jmax + (kw - 1) * w_dim1]), abs(d__2));
} else {
rowmax = 0.;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
itemp + (kw - 1) * w_dim1]), abs(d__2));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) {
kp = imax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = k - 1 - p;
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
i__1 = k - 1 - p;
zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
if (p > 1) {
i__1 = p - 1;
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = kk - 1 - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
i__1 = kk - 1 - kp;
zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (abs(t) >= sfmin) {
r1 = 1. / t;
i__1 = k - 1;
zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L14: */
}
}
i__1 = k - 1;
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
d_cnjg(&z__2, &d21);
z_div(&z__1, &w[k + kw * w_dim1], &z__2);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1. / (z__1.r - 1.);
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d21);
z__1.r = t * z__2.r, z__1.i = t * z__2.i;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
d_cnjg(&z__5, &d21);
z_div(&z__2, &z__3, &z__5);
z__1.r = t * z__2.r, z__1.i = t * z__2.i;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1;
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = k - 2;
zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
++jj;
if (kstep == 2 && jp1 != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = k + k * w_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
i__1 = k + k * w_dim1;
i__2 = k + k * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
i__1 = k + k * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1));
if (k < *n) {
i__1 = *n - k;
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
k * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
d__1 = w[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
if (k < *n) {
i__1 = *n - k;
zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
a_dim1], &c__1);
}
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = imax - k;
zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + imax * a_dim1;
d__1 = a[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
if (imax < *n) {
i__1 = *n - imax;
zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
imax + 1 + (k + 1) * w_dim1], &c__1);
}
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
i__1 = imax + (k + 1) * w_dim1;
i__2 = imax + (k + 1) * w_dim1;
d__1 = w[i__2].r;
w[i__1].r = d__1, w[i__1].i = 0.;
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
w[jmax + (k + 1) * w_dim1]), abs(d__2));
} else {
rowmax = 0.;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
itemp + (k + 1) * w_dim1]), abs(d__2));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p + p * a_dim1;
i__2 = k + k * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = p - k - 1;
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
a_dim1], lda);
i__1 = p - k - 1;
zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
if (p < *n) {
i__1 = *n - p;
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
* a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
}
zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
d__1 = a[i__2].r;
a[i__1].r = d__1, a[i__1].i = 0.;
i__1 = kp - kk - 1;
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
i__1 = kp - kk - 1;
zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
t = a[i__1].r;
if (abs(t) >= sfmin) {
r1 = 1. / t;
i__1 = *n - k;
zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
i__3 = ii + k * a_dim1;
z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L74: */
}
}
i__1 = *n - k;
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
d_cnjg(&z__2, &d21);
z_div(&z__1, &w[k + k * w_dim1], &z__2);
d22.r = z__1.r, d22.i = z__1.i;
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
d22.i + d11.i * d22.r;
t = 1. / (z__1.r - 1.);
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
d_cnjg(&z__5, &d21);
z_div(&z__2, &z__3, &z__5);
z__1.r = t * z__2.r, z__1.i = t * z__2.i;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d21);
z__1.r = t * z__2.r, z__1.i = t * z__2.i;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = *n - k;
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = *n - k - 1;
zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
--jj;
if (kstep == 2 && jp1 != jj && j >= 1) {
zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

87
relapack/src/zlauum.c Normal file
View File

@ -0,0 +1,87 @@
#include "relapack.h"
static void RELAPACK_zlauum_rec(const char *, const int *, double *,
const int *, int *);
/** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
*
* This routine is functionally equivalent to LAPACK's zlauum.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html
* */
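/* A minimal, hypothetical usage sketch (variable names and data are assumed):
 * after a Cholesky factorization has left L in the lower triangle of A, this
 * overwrites that triangle with L**H * L (column-major, interleaved
 * real/imaginary doubles).
 *
 *   int n = 100, ldA = n, info;
 *   double *A = malloc(2 * (size_t)ldA * n * sizeof(double));  // lower triangle holds L
 *   RELAPACK_zlauum("L", &n, A, &ldA, &info);                  // A(lower) := L**H * L
 */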
void RELAPACK_zlauum(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZLAUUM", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_zlauum_rec(&cleanuplo, n, A, ldA, info);
}
/** zlauum's recursive compute kernel */
static void RELAPACK_zlauum_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) {
// Unblocked
LAPACK(zlauu2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info);
if (*uplo == 'L') {
// A_TL = A_TL + A_BL' * A_BL
BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
// A_BL = A_BR' * A_BL
BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TL = A_TL + A_TR * A_TR'
BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
// A_TR = A_TR * A_BR'
BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info);
}

157
relapack/src/zpbtrf.c Normal file
View File

@ -0,0 +1,157 @@
#include "relapack.h"
#include "stdlib.h"
static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *,
double *, const int *, double *, const int *, int *);
/** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
*
* This routine is functionally equivalent to LAPACK's zpbtrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html
* */
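/* A minimal, hypothetical usage sketch (band contents and names are assumed):
 * the matrix is given in LAPACK band storage with kd super-diagonals and
 * leading dimension ldAb >= kd + 1, interleaved real/imaginary doubles; the
 * temporary workspace is allocated internally by the routine below.
 *
 *   int n = 1000, kd = 8, ldAb = kd + 1, info;
 *   double *Ab = malloc(2 * (size_t)ldAb * n * sizeof(double));  // fill in band storage
 *   RELAPACK_zpbtrf("U", &n, &kd, Ab, &ldAb, &info);             // banded Cholesky
 */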
void RELAPACK_zpbtrf(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*kd < 0)
*info = -3;
else if (*ldAb < *kd + 1)
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZPBTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Constant
const double ZERO[] = { 0., 0. };
// Allocate work space
const int n1 = ZREC_SPLIT(*n);
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
double *Work = malloc(mWork * nWork * 2 * sizeof(double));
LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
// Recursive kernel
RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
// Free work space
free(Work);
}
/** zpbtrf's recursive compute kernel */
static void RELAPACK_zpbtrf_rec(
const char *uplo, const int *n, const int *kd,
double *Ab, const int *ldAb,
double *Work, const int *ldWork,
int *info
){
if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) {
// Unblocked
LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
// Unskew A
const int ldA[] = { *ldAb - 1 };
double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
// Splitting
const int n1 = MIN(ZREC_SPLIT(*n), *kd);
const int n2 = *n - n1;
// * *
// * Ab_BR
double *const Ab_BR = Ab + 2 * *ldAb * n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
// Banded splitting
const int n21 = MIN(n2, *kd - n1);
const int n22 = MIN(n2 - n21, *kd);
// n1 n21 n22
// n1 * A_TRl A_TRr
// n21 A_BLt A_BRtl A_BRtr
// n22 A_BLb A_BRbl A_BRbr
double *const A_TRl = A_TR;
double *const A_TRr = A_TR + 2 * *ldA * n21;
double *const A_BLt = A_BL;
double *const A_BLb = A_BL + 2 * n21;
double *const A_BRtl = A_BR;
double *const A_BRtr = A_BR + 2 * *ldA * n21;
double *const A_BRbl = A_BR + 2 * n21;
double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
if (*uplo == 'L') {
// A_BLt = A_BLt / A_TL'
BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
// A_BRtl = A_BRtl - A_BLt * A_BLt'
BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
// Work = A_BLb
LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
// Work = Work / A_TL'
BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
// A_BRbl = A_BRbl - Work * A_BLt'
BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
// A_BRbr = A_BRbr - Work * Work'
BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_BLb = Work
LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
} else {
// A_TRl = A_TL' \ A_TRl
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
// A_BRtl = A_BRtl - A_TRl' * A_TRl
BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
// Work = A_TRr
LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
// Work = A_TL' \ Work
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
// A_BRtr = A_BRtr - A_TRl' * Work
BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
// A_BRbr = A_BRbr - Work' * Work
BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
// A_TRr = Work
LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
}
// recursion(A_BR)
if (*kd > n1)
RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info);
else
RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
if (*info)
*info += n1;
}

92
relapack/src/zpotrf.c Normal file
View File

@ -0,0 +1,92 @@
#include "relapack.h"
static void RELAPACK_zpotrf_rec(const char *, const int *, double *,
const int *, int *);
/** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
*
* This routine is functionally equivalent to LAPACK's zpotrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html
* */
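/* A minimal, hypothetical usage sketch (matrix data and names are assumed):
 * the matrix is stored column-major as interleaved real/imaginary doubles;
 * on success the lower triangle is overwritten with the Cholesky factor L,
 * while info > 0 reports that the leading minor of that order is not
 * positive definite.
 *
 *   int n = 500, ldA = n, info;
 *   double *A = malloc(2 * (size_t)ldA * n * sizeof(double));  // fill with A = L * L**H
 *   RELAPACK_zpotrf("L", &n, A, &ldA, &info);
 */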
void RELAPACK_zpotrf(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZPOTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Recursive kernel
RELAPACK_zpotrf_rec(&cleanuplo, n, A, ldA, info);
}
/** zpotrf's recursive compute kernel */
static void RELAPACK_zpotrf_rec(
const char *uplo, const int *n,
double *A, const int *ldA,
int *info
) {
if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) {
// Unblocked
LAPACK(zpotf2)(uplo, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
// recursion(A_TL)
RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = A_BL / A_TL'
BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
// A_BR = A_BR - A_BL * A_BL'
BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
} else {
// A_TR = A_TL' \ A_TR
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
// A_BR = A_BR - A_TR' * A_TR
BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
}
// recursion(A_BR)
RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}

238
relapack/src/zsytrf.c Normal file
View File

@ -0,0 +1,238 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's zsytrf.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html
* */
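/* A minimal, hypothetical usage sketch (names and sizes are assumed): the
 * calling convention mirrors LAPACK's zsytrf, with the symmetric matrix stored
 * column-major as interleaved real/imaginary doubles and the workspace size
 * obtained through the lWork == -1 query handled below.
 *
 *   int n = 100, ldA = n, lWork = -1, info;
 *   int *ipiv = malloc(n * sizeof(int));
 *   double *A = malloc(2 * (size_t)ldA * n * sizeof(double));  // fill with symmetric data
 *   double query[2];
 *   RELAPACK_zsytrf("U", &n, A, &ldA, ipiv, query, &lWork, &info);  // workspace query
 *   lWork = (int)query[0];
 *   double *Work = malloc(2 * (size_t)lWork * sizeof(double));
 *   RELAPACK_zsytrf("U", &n, A, &ldA, ipiv, Work, &lWork, &info);   // A = U*D*U**T
 */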
void RELAPACK_zsytrf(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy arguments
int nout;
// Recursive kernel
RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** zsytrf's recursive compute kernel */
static void RELAPACK_zsytrf_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) {
// Unblocked
if (top) {
LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Loop iterator
int i;
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = ZREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
double *const A_BL_B = A + 2 * *n;
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + 2 * n1;
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = ZREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + 2 * *ldA * n_rest;
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

452
relapack/src/zsytrf_rec2.c Normal file
View File

@ -0,0 +1,452 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static doublecomplex c_b1 = {1.,0.};
static int c__1 = 1;
/** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's zlasyf.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int *
nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
double d__1, d__2, d__3, d__4;
doublecomplex z__1, z__2, z__3;
/* Builtin functions */
double sqrt(double), d_imag(doublecomplex *);
void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k;
static doublecomplex t, r1, d11, d21, d22;
static int jj, kk, jp, kp, kw, kkw, imax, jmax;
static double alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int zscal_(int *, doublecomplex *,
doublecomplex *, int *);
static int kstep;
extern /* Subroutine */ int zgemv_(char *, int *, int *,
doublecomplex *, doublecomplex *, int *, doublecomplex *,
int *, doublecomplex *, doublecomplex *, int *, ftnlen),
zcopy_(int *, doublecomplex *, int *, doublecomplex *,
int *), zswap_(int *, doublecomplex *, int *,
doublecomplex *, int *);
static double absakk, colmax;
extern int izamax_(int *, doublecomplex *, int *);
static double rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
kstep = 1;
i__1 = k + kw * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
w_dim1]), abs(d__2));
if (k > 1) {
i__1 = k - 1;
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
kw * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
i__1 = k - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
&c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
jmax + (kw - 1) * w_dim1]), abs(d__2));
if (imax > 1) {
i__1 = imax - 1;
jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
i__1 = jmax + (kw - 1) * w_dim1;
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs(
d__2));
rowmax = max(d__3,d__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (kw - 1) * w_dim1;
if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha *
rowmax) {
kp = imax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kk - 1 - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
if (kp > 1) {
i__1 = kp - 1;
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ 1], &c__1);
}
if (k < *n) {
i__1 = *n - k;
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ 1) * a_dim1], lda);
}
i__1 = *n - kk + 1;
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
r1.r = z__1.r, r1.i = z__1.i;
i__1 = k - 1;
zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + kw * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
d22.i + d11.i * d22.r;
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
z_div(&z__1, &c_b1, &z__2);
t.r = z__1.r, t.i = z__1.i;
z_div(&z__1, &t, &d21);
d21.r = z__1.r, d21.i = z__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
++j;
}
++j;
if (jp != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
}
if (j < *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
i__1 = *n - k + 1;
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
kstep = 1;
i__1 = k + k * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k *
w_dim1]), abs(d__2));
if (k < *n) {
i__1 = *n - k;
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
k * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
} else {
if (absakk >= alpha * colmax) {
kp = k;
} else {
i__1 = imax - k;
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1],
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
w_dim1], &c__1, (ftnlen)12);
i__1 = imax - k;
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
;
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
jmax + (k + 1) * w_dim1]), abs(d__2));
if (imax < *n) {
i__1 = *n - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
/* Computing MAX */
i__1 = jmax + (k + 1) * w_dim1;
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs(
d__2));
rowmax = max(d__3,d__4);
}
if (absakk >= alpha * colmax * (colmax / rowmax)) {
kp = k;
} else /* if(complicated condition) */ {
i__1 = imax + (k + 1) * w_dim1;
if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha *
rowmax) {
kp = imax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
k * w_dim1], &c__1);
} else {
kp = imax;
kstep = 2;
}
}
}
kk = k + kstep - 1;
if (kp != kk) {
i__1 = kp + kp * a_dim1;
i__2 = kk + kk * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - kk - 1;
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
1) * a_dim1], lda);
if (kp < *n) {
i__1 = *n - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ kp * a_dim1], &c__1);
}
if (k > 1) {
i__1 = k - 1;
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
}
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
r1.r = z__1.r, r1.i = z__1.i;
i__1 = *n - k;
zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k + k * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
d22.i + d11.i * d22.r;
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
z_div(&z__1, &c_b1, &z__2);
t.r = z__1.r, t.i = z__1.i;
z_div(&z__1, &t, &d21);
d21.r = z__1.r, d21.i = z__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
.i;
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
d21.r * z__2.i + d21.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -kp;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
jj = j;
jp = ipiv[j];
if (jp < 0) {
jp = -jp;
--j;
}
--j;
if (jp != jj && j >= 1) {
zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j > 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

236
relapack/src/zsytrf_rook.c Normal file
View File

@ -0,0 +1,236 @@
#include "relapack.h"
#if XSYTRF_ALLOW_MALLOC
#include <stdlib.h>
#endif
static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *,
double *, const int *, int *, double *, const int *, int *);
/** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is functionally equivalent to LAPACK's zsytrf_rook.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html
* */
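/* A minimal, hypothetical usage sketch (names and data are assumed): the call
 * sequence matches RELAPACK_zsytrf, only with bounded ("rook") pivoting; here
 * the workspace of n * (n / 2) complex entries is sized directly instead of
 * using the lWork == -1 query.
 *
 *   int n = 100, ldA = n, info;
 *   int *ipiv = malloc(n * sizeof(int));
 *   double *A = malloc(2 * (size_t)ldA * n * sizeof(double));  // fill with symmetric data
 *   int lWork = n * (n / 2);
 *   double *Work = malloc(2 * (size_t)lWork * sizeof(double));
 *   RELAPACK_zsytrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
 */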
void RELAPACK_zsytrf_rook(
const char *uplo, const int *n,
double *A, const int *ldA, int *ipiv,
double *Work, const int *lWork, int *info
) {
// Required work size
const int cleanlWork = *n * (*n / 2);
int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
minlWork = 1;
#endif
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
*info = -4;
else if (*lWork < minlWork && *lWork != -1)
*info = -7;
else if (*lWork == -1) {
// Work size query
*Work = cleanlWork;
return;
}
// Ensure Work size
double *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
if (!*info && *lWork < cleanlWork) {
cleanWork = malloc(cleanlWork * 2 * sizeof(double));
if (!cleanWork)
*info = -7;
}
#endif
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZSYTRF", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
// Dummy argument
int nout;
// Recursive kernel
RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
if (cleanWork != Work)
free(cleanWork);
#endif
}
/** zsytrf_rook's recursive compute kernel */
static void RELAPACK_zsytrf_rook_rec(
const char *uplo, const int *n_full, const int *n, int *n_out,
double *A, const int *ldA, int *ipiv,
double *Work, const int *ldWork, int *info
) {
// top recursion level?
const int top = *n_full == *n;
if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) {
// Unblocked
if (top) {
LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info);
*n_out = *n;
} else
RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
return;
}
int info1, info2;
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
const int n_rest = *n_full - *n;
if (*uplo == 'L') {
// Splitting (setup)
int n1 = ZREC_SPLIT(*n);
int n2 = *n - n1;
// Work_L *
double *const Work_L = Work;
// recursion(A_L)
int n1_out;
RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
n1 = n1_out;
// Splitting (continued)
n2 = *n - n1;
const int n_full2 = *n_full - n1;
// * *
// A_BL A_BR
// A_BL_B A_BR_B
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
double *const A_BL_B = A + 2 * *n;
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
// * *
// Work_BL Work_BR
// * *
// (top recursion level: use Work as Work_BR)
double *const Work_BL = Work + 2 * n1;
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
const int ldWork_BR = top ? n2 : *ldWork;
// ipiv_T
// ipiv_B
int *const ipiv_B = ipiv + n1;
// A_BR = A_BR - A_BL Work_BL'
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
// recursion(A_BR)
int n2_out;
RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
if (n2_out != n2) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// last column of A_BR
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
// last row of A_BL
double *const A_BL_b = A_BL + 2 * n2_out;
// last row of Work_BL
double *const Work_BL_b = Work_BL + 2 * n2_out;
// A_BR_r = A_BR_r + A_BL_b Work_BL_b'
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
}
n2 = n2_out;
// shift pivots
int i;
for (i = 0; i < n2; i++)
if (ipiv_B[i] > 0)
ipiv_B[i] += n1;
else
ipiv_B[i] -= n1;
*info = info1 || info2;
*n_out = n1 + n2;
} else {
// Splitting (setup)
int n2 = ZREC_SPLIT(*n);
int n1 = *n - n2;
// * Work_R
// (top recursion level: use Work as Work_R)
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
// recursion(A_R)
int n2_out;
RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
const int n2_diff = n2 - n2_out;
n2 = n2_out;
// Splitting (continued)
n1 = *n - n2;
const int n_full1 = *n_full - n2;
// * A_TL_T A_TR_T
// * A_TL A_TR
// * * *
double *const A_TL_T = A + 2 * *ldA * n_rest;
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
// Work_L *
// * Work_TR
// * *
// (top recursion level: Work_R was Work)
double *const Work_L = Work;
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
const int ldWork_L = top ? n1 : *ldWork;
// A_TL = A_TL - A_TR Work_TR'
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
// recursion(A_TL)
int n1_out;
RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
if (n1_out != n1) {
// undo 1 column of updates
const int n_restp1 = n_rest + 1;
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
}
n1 = n1_out;
*info = info2 || info1;
*n_out = n1 + n2;
}
}

561
relapack/src/zsytrf_rook_rec2.c Normal file

@ -0,0 +1,561 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "f2c.h"
/* Table of constant values */
static doublecomplex c_b1 = {1.,0.};
static int c__1 = 1;
/** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
*
* This routine is a minor modification of LAPACK's zlasyf_rook.
* It serves as an unblocked kernel in the recursive algorithms.
* The blocked BLAS Level 3 updates were removed and moved to the
* recursive algorithm.
* */
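/* As in LAPACK's zlasyf_rook, *nb is the number of columns requested and
 * *kb returns the number of columns actually factored (possibly *nb - 1
 * when a 2-by-2 pivot would cross the panel boundary); the recursive
 * caller above picks this difference up through its n_out argument. */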
/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n,
int *nb, int *kb, doublecomplex *a, int *lda, int *
ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
{
/* System generated locals */
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
double d__1, d__2;
doublecomplex z__1, z__2, z__3, z__4;
/* Builtin functions */
double sqrt(double), d_imag(doublecomplex *);
void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k, p;
static doublecomplex t, r1, d11, d12, d21, d22;
static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
static logical done;
static int imax, jmax;
static double alpha;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
static double dtemp, sfmin;
extern /* Subroutine */ int zscal_(int *, doublecomplex *,
doublecomplex *, int *);
static int itemp, kstep;
extern /* Subroutine */ int zgemv_(char *, int *, int *,
doublecomplex *, doublecomplex *, int *, doublecomplex *,
int *, doublecomplex *, doublecomplex *, int *, ftnlen),
zcopy_(int *, doublecomplex *, int *, doublecomplex *,
int *), zswap_(int *, doublecomplex *, int *,
doublecomplex *, int *);
extern double dlamch_(char *, ftnlen);
static double absakk, colmax;
extern int izamax_(int *, doublecomplex *, int *);
static double rowmax;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--ipiv;
w_dim1 = *ldw;
w_offset = 1 + w_dim1;
w -= w_offset;
/* Function Body */
*info = 0;
alpha = (sqrt(17.) + 1.) / 8.;
sfmin = dlamch_("S", (ftnlen)1);
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
k = *n;
L10:
kw = *nb + k - *n;
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
goto L30;
}
kstep = 1;
p = k;
zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
w_dim1 + 1], &c__1, (ftnlen)12);
}
i__1 = k + kw * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
w_dim1]), abs(d__2));
if (k > 1) {
i__1 = k - 1;
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
i__1 = imax + kw * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
kw * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L12:
zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
w_dim1 + 1], &c__1);
i__1 = k - imax;
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
1 + (kw - 1) * w_dim1], &c__1);
if (k < *n) {
i__1 = *n - k;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
ftnlen)12);
}
if (imax != k) {
i__1 = k - imax;
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) *
w_dim1], &c__1);
i__1 = jmax + (kw - 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
w[jmax + (kw - 1) * w_dim1]), abs(d__2));
} else {
rowmax = 0.;
}
if (imax > 1) {
i__1 = imax - 1;
itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
i__1 = itemp + (kw - 1) * w_dim1;
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
itemp + (kw - 1) * w_dim1]), abs(d__2));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
i__1 = imax + (kw - 1) * w_dim1;
if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax
+ (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) {
kp = imax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
w_dim1 + 1], &c__1);
}
if (! done) {
goto L12;
}
}
kk = k - kstep + 1;
kkw = *nb + kk - *n;
if (kstep == 2 && p != k) {
i__1 = k - p;
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
a_dim1], lda);
zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
c__1);
i__1 = *n - k + 1;
zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
lda);
i__1 = *n - kk + 1;
zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = k - 1 - kp;
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
1) * a_dim1], lda);
zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
c__1);
i__1 = *n - kk + 1;
zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
lda);
i__1 = *n - kk + 1;
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
w_dim1], ldw);
}
if (kstep == 1) {
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
c__1);
if (k > 1) {
i__1 = k + k * a_dim1;
if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k +
k * a_dim1]), abs(d__2)) >= sfmin) {
z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
r1.r = z__1.r, r1.i = z__1.i;
i__1 = k - 1;
zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0. || a[i__1].i != 0.) {
i__1 = k - 1;
for (ii = 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L14: */
}
}
}
}
} else {
if (k > 2) {
i__1 = k - 1 + kw * w_dim1;
d12.r = w[i__1].r, d12.i = w[i__1].i;
z_div(&z__1, &w[k + kw * w_dim1], &d12);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
d22.r = z__1.r, d22.i = z__1.i;
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
d22.i + d11.i * d22.r;
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
z_div(&z__1, &c_b1, &z__2);
t.r = z__1.r, t.i = z__1.i;
i__1 = k - 2;
for (j = 1; j <= i__1; ++j) {
i__2 = j + (k - 1) * a_dim1;
i__3 = j + (kw - 1) * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + kw * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d12);
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
z__2.i + t.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + k * a_dim1;
i__3 = j + kw * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + (kw - 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d12);
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
z__2.i + t.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L20: */
}
}
i__1 = k - 1 + (k - 1) * a_dim1;
i__2 = k - 1 + (kw - 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k - 1 + k * a_dim1;
i__2 = k - 1 + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + k * a_dim1;
i__2 = k + kw * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k - 1] = -kp;
}
k -= kstep;
goto L10;
L30:
j = k + 1;
L60:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
++j;
jp1 = -ipiv[j];
kstep = 2;
}
++j;
if (jp2 != jj && j <= *n) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
jj = j - 1;
if (jp1 != jj && kstep == 2) {
i__1 = *n - j + 1;
zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
;
}
if (j <= *n) {
goto L60;
}
*kb = *n - k;
} else {
k = 1;
L70:
if ((k >= *nb && *nb < *n) || k > *n) {
goto L90;
}
kstep = 1;
p = k;
i__1 = *n - k + 1;
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
ftnlen)12);
}
i__1 = k + k * w_dim1;
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k *
w_dim1]), abs(d__2));
if (k < *n) {
i__1 = *n - k;
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
i__1 = imax + k * w_dim1;
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
k * w_dim1]), abs(d__2));
} else {
colmax = 0.;
}
if (max(absakk,colmax) == 0.) {
if (*info == 0) {
*info = k;
}
kp = k;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
} else {
if (! (absakk < alpha * colmax)) {
kp = k;
} else {
done = FALSE_;
L72:
i__1 = imax - k;
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
w_dim1], &c__1);
i__1 = *n - imax + 1;
zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
1) * w_dim1], &c__1);
if (k > 1) {
i__1 = *n - k + 1;
i__2 = k - 1;
z__1.r = -1., z__1.i = -0.;
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1]
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
1) * w_dim1], &c__1, (ftnlen)12);
}
if (imax != k) {
i__1 = imax - k;
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &
c__1);
i__1 = jmax + (k + 1) * w_dim1;
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
w[jmax + (k + 1) * w_dim1]), abs(d__2));
} else {
rowmax = 0.;
}
if (imax < *n) {
i__1 = *n - imax;
itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
w_dim1], &c__1);
i__1 = itemp + (k + 1) * w_dim1;
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
itemp + (k + 1) * w_dim1]), abs(d__2));
if (dtemp > rowmax) {
rowmax = dtemp;
jmax = itemp;
}
}
i__1 = imax + (k + 1) * w_dim1;
if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax
+ (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) {
kp = imax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
done = TRUE_;
} else if (p == jmax || rowmax <= colmax) {
kp = imax;
kstep = 2;
done = TRUE_;
} else {
p = imax;
colmax = rowmax;
imax = jmax;
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
w_dim1], &c__1);
}
if (! done) {
goto L72;
}
}
kk = k + kstep - 1;
if (kstep == 2 && p != k) {
i__1 = p - k;
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
lda);
i__1 = *n - p + 1;
zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
c__1);
zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
}
if (kp != kk) {
i__1 = kp + k * a_dim1;
i__2 = kk + k * a_dim1;
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
i__1 = kp - k - 1;
zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
* a_dim1], lda);
i__1 = *n - kp + 1;
zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
a_dim1], &c__1);
zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
}
if (kstep == 1) {
i__1 = *n - k + 1;
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
c__1);
if (k < *n) {
i__1 = k + k * a_dim1;
if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k +
k * a_dim1]), abs(d__2)) >= sfmin) {
z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
r1.r = z__1.r, r1.i = z__1.i;
i__1 = *n - k;
zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
} else /* if(complicated condition) */ {
i__1 = k + k * a_dim1;
if (a[i__1].r != 0. || a[i__1].i != 0.) {
i__1 = *n;
for (ii = k + 1; ii <= i__1; ++ii) {
i__2 = ii + k * a_dim1;
z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
a_dim1]);
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L74: */
}
}
}
}
} else {
if (k < *n - 1) {
i__1 = k + 1 + k * w_dim1;
d21.r = w[i__1].r, d21.i = w[i__1].i;
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
d11.r = z__1.r, d11.i = z__1.i;
z_div(&z__1, &w[k + k * w_dim1], &d21);
d22.r = z__1.r, d22.i = z__1.i;
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
d22.i + d11.i * d22.r;
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
z_div(&z__1, &c_b1, &z__2);
t.r = z__1.r, t.i = z__1.i;
i__1 = *n;
for (j = k + 2; j <= i__1; ++j) {
i__2 = j + k * a_dim1;
i__3 = j + k * w_dim1;
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
.r;
i__4 = j + (k + 1) * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d21);
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
z__2.i + t.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
i__2 = j + (k + 1) * a_dim1;
i__3 = j + (k + 1) * w_dim1;
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
.r;
i__4 = j + k * w_dim1;
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
.i;
z_div(&z__2, &z__3, &d21);
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
z__2.i + t.i * z__2.r;
a[i__2].r = z__1.r, a[i__2].i = z__1.i;
/* L80: */
}
}
i__1 = k + k * a_dim1;
i__2 = k + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + k * a_dim1;
i__2 = k + 1 + k * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
i__1 = k + 1 + (k + 1) * a_dim1;
i__2 = k + 1 + (k + 1) * w_dim1;
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
}
}
if (kstep == 1) {
ipiv[k] = kp;
} else {
ipiv[k] = -p;
ipiv[k + 1] = -kp;
}
k += kstep;
goto L70;
L90:
j = k - 1;
L120:
kstep = 1;
jp1 = 1;
jj = j;
jp2 = ipiv[j];
if (jp2 < 0) {
jp2 = -jp2;
--j;
jp1 = -ipiv[j];
kstep = 2;
}
--j;
if (jp2 != jj && j >= 1) {
zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
}
jj = j + 1;
if (jp1 != jj && kstep == 2) {
zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
}
if (j >= 1) {
goto L120;
}
*kb = k - 1;
}
return;
}

268
relapack/src/ztgsyl.c Normal file

@ -0,0 +1,268 @@
#include "relapack.h"
#include <math.h>
static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *,
const int *, const double *, const int *, const double *, const int *,
double *, const int *, const double *, const int *, const double *,
const int *, double *, const int *, double *, double *, double *, int *);
/** ZTGSYL solves the generalized Sylvester equation.
*
* This routine is functionally equivalent to LAPACK's ztgsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html
* */
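/* For trans = 'N' the routine solves the coupled pair
 *     A * R - L * B = scale * C
 *     D * R - L * E = scale * F
 * for R and L, which overwrite C and F; (A, D) are m-by-m and (B, E) are
 * n-by-n upper triangular (generalized Schur form). For trans = 'N' and
 * ijob >= 1 it also returns an estimate of Dif computed from dsum/dscale. */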
void RELAPACK_ztgsyl(
const char *trans, const int *ijob, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dif,
double *Work, const int *lWork, int *iWork, int *info
) {
// Parse arguments
const int notran = LAPACK(lsame)(trans, "N");
const int tran = LAPACK(lsame)(trans, "C");
// Compute work buffer size
int lwmin = 1;
if (notran && (*ijob == 1 || *ijob == 2))
lwmin = MAX(1, 2 * *m * *n);
*info = 0;
// Check arguments
if (!tran && !notran)
*info = -1;
else if (notran && (*ijob < 0 || *ijob > 4))
*info = -2;
else if (*m <= 0)
*info = -3;
else if (*n <= 0)
*info = -4;
else if (*ldA < MAX(1, *m))
*info = -6;
else if (*ldB < MAX(1, *n))
*info = -8;
else if (*ldC < MAX(1, *m))
*info = -10;
else if (*ldD < MAX(1, *m))
*info = -12;
else if (*ldE < MAX(1, *n))
*info = -14;
else if (*ldF < MAX(1, *m))
*info = -16;
else if (*lWork < lwmin && *lWork != -1)
*info = -20;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZTGSYL", &minfo);
return;
}
if (*lWork == -1) {
// Work size query
*Work = lwmin;
return;
}
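/*
 * Workspace-query sketch (illustrative only; variable names are examples):
 *
 *   int lwork = -1, info, iwork[1];   // iwork sized m+n+2 for the real call
 *   double query[2], scale, dif;
 *   RELAPACK_ztgsyl("N", &ijob, &m, &n, A, &ldA, B, &ldB, C, &ldC,
 *                   D, &ldD, E, &ldE, F, &ldF, &scale, &dif,
 *                   query, &lwork, iwork, &info);
 *   lwork = (int) query[0];   // minimal size in complex elements; allocate
 *                             // 2*lwork doubles (interleaved) and call again
 */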
// Clean char * arguments
const char cleantrans = notran ? 'N' : 'C';
// Constant
const double ZERO[] = { 0., 0. };
int isolve = 1;
int ifunc = 0;
if (notran) {
if (*ijob >= 3) {
ifunc = *ijob - 2;
LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
} else if (*ijob >= 1)
isolve = 2;
}
double scale2;
int iround;
for (iround = 1; iround <= isolve; iround++) {
*scale = 1;
double dscale = 0;
double dsum = 1;
RELAPACK_ztgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
if (dscale != 0) {
if (*ijob == 1 || *ijob == 3)
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
else
*dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
}
if (isolve == 2) {
if (iround == 1) {
if (notran)
ifunc = *ijob;
scale2 = *scale;
LAPACK(zlacpy)("F", m, n, C, ldC, Work, m);
LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
} else {
LAPACK(zlacpy)("F", m, n, Work, m, C, ldC);
LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
*scale = scale2;
}
}
}
}
/** ztgsyl's recursive compute kernel */
static void RELAPACK_ztgsyl_rec(
const char *trans, const int *ifunc, const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC,
const double *D, const int *ldD, const double *E, const int *ldE,
double *F, const int *ldF,
double *scale, double *dsum, double *dscale,
int *info
) {
if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) {
// Unblocked
LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
return;
}
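// Divide and conquer: split along the larger of m and n. One half is solved
// recursively, the coupling through the off-diagonal blocks (A_TR/D_TR or
// B_TR/E_TR) is removed with zgemm, and the other half is solved
// recursively. The two partial scale factors are combined into *scale at
// the end, and dsum/dscale accumulate across both calls for the Dif
// estimate.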
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const int iONE[] = { 1 };
// Outputs
double scale1[] = { 1., 0. };
double scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = ZREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const double *const A_TL = A;
const double *const A_TR = A + 2 * *ldA * m1;
const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
double *const C_T = C;
double *const C_B = C + 2 * m1;
// D_TL D_TR
// 0 D_BR
const double *const D_TL = D;
const double *const D_TR = D + 2 * *ldD * m1;
const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1;
// F_T
// F_B
double *const F_T = F;
double *const F_B = F + 2 * m1;
if (*trans == 'N') {
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
// C_T = C_T - A_TR * C_B
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// F_T = F_T - D_TR * C_B
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
}
} else {
// recursion(A_TL, B, C_T, D_TL, E, F_T)
RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
// C_B = C_B - A_TR^H * C_T
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// C_B = C_B - D_TR^H * F_T
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
// recursion(A_BR, B, C_B, D_BR, E, F_B)
RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
}
}
} else {
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const double *const B_TL = B;
const double *const B_TR = B + 2 * *ldB * n1;
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
double *const C_L = C;
double *const C_R = C + 2 * *ldC * n1;
// E_TL E_TR
// 0 E_BR
const double *const E_TL = E;
const double *const E_TR = E + 2 * *ldE * n1;
const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1;
// F_L F_R
double *const F_L = F;
double *const F_R = F + 2 * *ldF * n1;
if (*trans == 'N') {
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
// C_R = C_R + F_L * B_TR
BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
// F_R = F_R + F_L * E_TR
BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
}
} else {
// recursion(A, B_BR, C_R, D, E_BR, F_R)
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
// apply scale
if (scale1[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
// F_L = F_L + C_R * B_TR^H
BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
// F_L = F_L + F_R * E_TR^H
BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF);
// recursion(A, B_TL, C_L, D, E_TL, F_L)
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
// apply scale
if (scale2[0] != 1) {
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
}
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

163
relapack/src/ztrsyl.c Normal file

@ -0,0 +1,163 @@
#include "relapack.h"
static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *,
const int *, const int *, const double *, const int *, const double *,
const int *, double *, const int *, double *, int *);
/** ZTRSYL solves the complex Sylvester matrix equation.
*
* This routine is functionally equivalent to LAPACK's ztrsyl.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html
* */
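/* The equation solved is op(A)*X + isgn*X*op(B) = scale*C, where A (m-by-m)
 * and B (n-by-n) are upper triangular (Schur form), op(.) is either the
 * identity or the conjugate transpose, X overwrites C, and scale <= 1 is
 * chosen to avoid overflow. */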
void RELAPACK_ztrsyl(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
// Check arguments
const int notransA = LAPACK(lsame)(tranA, "N");
const int ctransA = LAPACK(lsame)(tranA, "C");
const int notransB = LAPACK(lsame)(tranB, "N");
const int ctransB = LAPACK(lsame)(tranB, "C");
*info = 0;
if (!ctransA && !notransA)
*info = -1;
else if (!ctransB && !notransB)
*info = -2;
else if (*isgn != 1 && *isgn != -1)
*info = -3;
else if (*m < 0)
*info = -4;
else if (*n < 0)
*info = -5;
else if (*ldA < MAX(1, *m))
*info = -7;
else if (*ldB < MAX(1, *n))
*info = -9;
else if (*ldC < MAX(1, *m))
*info = -11;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZTRSYL", &minfo);
return;
}
// Clean char * arguments
const char cleantranA = notransA ? 'N' : 'C';
const char cleantranB = notransB ? 'N' : 'C';
// Recursive kernel
RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
}
/** ztrsyl's recursive compute kernel */
static void RELAPACK_ztrsyl_rec(
const char *tranA, const char *tranB, const int *isgn,
const int *m, const int *n,
const double *A, const int *ldA, const double *B, const int *ldB,
double *C, const int *ldC, double *scale,
int *info
) {
if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) {
// Unblocked
RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
const double MSGN[] = { -*isgn, 0. };
const int iONE[] = { 1 };
// Outputs
double scale1[] = { 1., 0. };
double scale2[] = { 1., 0. };
int info1[] = { 0 };
int info2[] = { 0 };
if (*m > *n) {
// Splitting
const int m1 = ZREC_SPLIT(*m);
const int m2 = *m - m1;
// A_TL A_TR
// 0 A_BR
const double *const A_TL = A;
const double *const A_TR = A + 2 * *ldA * m1;
const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
// C_T
// C_B
double *const C_T = C;
double *const C_B = C + 2 * m1;
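// With A partitioned as [ A_TL A_TR ; 0 A_BR ], the equation decouples into
// two smaller Sylvester equations for C_T and C_B that interact only through
// A_TR. Depending on op(A), one block is solved first, its contribution is
// removed from the other block with a single zgemm, the remaining block is
// solved, and the block solved first is rescaled so that both share the
// common factor scale1*scale2.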
if (*tranA == 'N') {
// recursion(A_BR, B, C_B)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
// C_T = C_T - A_TR * C_B
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
// recursion(A_TL, B, C_T)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
} else {
// recursion(A_TL, B, C_T)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
// C_B = C_B - A_TR' * C_T
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
// recursion(A_BR, B, C_B)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
}
} else {
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// B_TL B_TR
// 0 B_BR
const double *const B_TL = B;
const double *const B_TR = B + 2 * *ldB * n1;
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
// C_L C_R
double *const C_L = C;
double *const C_R = C + 2 * *ldC * n1;
if (*tranB == 'N') {
// recursion(A, B_TL, C_L)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
// C_R = C_R -/+ C_L * B_TR
BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
// recursion(A, B_BR, C_R)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
} else {
// recursion(A, B_BR, C_R)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
// C_L = C_L -/+ C_R * B_TR'
BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
// recursion(A, B_TL, C_L)
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
// apply scale
if (scale2[0] != 1)
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
}
}
*scale = scale1[0] * scale2[0];
*info = info1[0] || info2[0];
}

394
relapack/src/ztrsyl_rec2.c Normal file

@ -0,0 +1,394 @@
/* -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "../config.h"
#include "f2c.h"
#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) {
extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *);
doublecomplex result;
zdotu_(&result, n, x, incx, y, incy);
return result;
}
#define zdotu_ zdotu_fun
doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) {
extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *);
doublecomplex result;
zdotc_(&result, n, x, incx, y, incy);
return result;
}
#define zdotc_ zdotc_fun
#endif
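/* The wrappers above are only compiled when the build configuration
 * (BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES) indicates that the BLAS returns
 * complex function values through a hidden first argument, as with
 * f2c/g77-style interfaces. In that case zdotu/zdotc are invoked as
 * subroutines and re-exposed here with a function-style interface so the
 * f2c-translated code below can use them unchanged. */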
#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) {
extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *);
doublecomplex result;
zladiv_(&result, a, b);
return result;
}
#define zladiv_ zladiv_fun
#endif
/* Table of constant values */
static int c__1 = 1;
/** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
*
* This routine is an exact copy of LAPACK's ztrsyl.
* It serves as an unblocked kernel in the recursive algorithms.
* */
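/* The trailing ftnlen arguments are the hidden character-length parameters
 * that f2c appends for the char* arguments; they are not used in the body,
 * since lsame_ is always called with explicit (ftnlen)1 lengths. */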
/* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int
*isgn, int *m, int *n, doublecomplex *a, int *lda,
doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc,
double *scale, int *info, ftnlen trana_len, ftnlen tranb_len)
{
/* System generated locals */
int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
i__3, i__4;
double d__1, d__2;
doublecomplex z__1, z__2, z__3, z__4;
/* Builtin functions */
double d_imag(doublecomplex *);
void d_cnjg(doublecomplex *, doublecomplex *);
/* Local variables */
static int j, k, l;
static doublecomplex a11;
static double db;
static doublecomplex x11;
static double da11;
static doublecomplex vec;
static double dum[1], eps, sgn, smin;
static doublecomplex suml, sumr;
extern int lsame_(char *, char *, ftnlen, ftnlen);
/* Double Complex */ doublecomplex zdotc_(int *,
doublecomplex *, int *, doublecomplex *, int *), zdotu_(
int *, doublecomplex *, int *,
doublecomplex *, int *);
extern /* Subroutine */ int dlabad_(double *, double *);
extern double dlamch_(char *, ftnlen);
static double scaloc;
extern /* Subroutine */ int xerbla_(char *, int *, ftnlen);
extern double zlange_(char *, int *, int *, doublecomplex *,
int *, double *, ftnlen);
static double bignum;
extern /* Subroutine */ int zdscal_(int *, double *,
doublecomplex *, int *);
/* Double Complex */ doublecomplex zladiv_(doublecomplex *,
doublecomplex *);
static int notrna, notrnb;
static double smlnum;
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
b_dim1 = *ldb;
b_offset = 1 + b_dim1;
b -= b_offset;
c_dim1 = *ldc;
c_offset = 1 + c_dim1;
c__ -= c_offset;
/* Function Body */
notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
*info = 0;
if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
*info = -1;
} else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
*info = -2;
} else if (*isgn != 1 && *isgn != -1) {
*info = -3;
} else if (*m < 0) {
*info = -4;
} else if (*n < 0) {
*info = -5;
} else if (*lda < max(1,*m)) {
*info = -7;
} else if (*ldb < max(1,*n)) {
*info = -9;
} else if (*ldc < max(1,*m)) {
*info = -11;
}
if (*info != 0) {
i__1 = -(*info);
xerbla_("ZTRSY2", &i__1, (ftnlen)6);
return;
}
*scale = 1.;
if (*m == 0 || *n == 0) {
return;
}
eps = dlamch_("P", (ftnlen)1);
smlnum = dlamch_("S", (ftnlen)1);
bignum = 1. / smlnum;
dlabad_(&smlnum, &bignum);
smlnum = smlnum * (double) (*m * *n) / eps;
bignum = 1. / smlnum;
/* Computing MAX */
d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, (
ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n,
&b[b_offset], ldb, dum, (ftnlen)1);
smin = max(d__1,d__2);
sgn = (double) (*isgn);
if (notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
for (k = *m; k >= 1; --k) {
i__2 = *m - k;
/* Computing MIN */
i__3 = k + 1;
/* Computing MIN */
i__4 = k + 1;
z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
min(i__4,*m) + l * c_dim1], &c__1);
suml.r = z__1.r, suml.i = z__1.i;
i__2 = l - 1;
z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = z__1.r, sumr.i = z__1.i;
i__2 = k + l * c_dim1;
z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i;
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i;
vec.r = z__1.r, vec.i = z__1.i;
scaloc = 1.;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i;
z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i;
a11.r = z__1.r, a11.i = z__1.i;
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
d__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.;
da11 = smin;
*info = 1;
}
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
d__2));
if (da11 < 1. && db > 1.) {
if (db > bignum * da11) {
scaloc = 1. / db;
}
}
z__3.r = scaloc, z__3.i = 0.;
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
z__3.i + vec.i * z__3.r;
z__1 = zladiv_(&z__2, &a11);
x11.r = z__1.r, x11.i = z__1.i;
if (scaloc != 1.) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L10: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L20: */
}
/* L30: */
}
} else if (! notrna && notrnb) {
i__1 = *n;
for (l = 1; l <= i__1; ++l) {
i__2 = *m;
for (k = 1; k <= i__2; ++k) {
i__3 = k - 1;
z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = z__1.r, suml.i = z__1.i;
i__3 = l - 1;
z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
, &c__1);
sumr.r = z__1.r, sumr.i = z__1.i;
i__3 = k + l * c_dim1;
z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i;
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i;
vec.r = z__1.r, vec.i = z__1.i;
scaloc = 1.;
d_cnjg(&z__2, &a[k + k * a_dim1]);
i__3 = l + l * b_dim1;
z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i;
z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
a11.r = z__1.r, a11.i = z__1.i;
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
d__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.;
da11 = smin;
*info = 1;
}
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
d__2));
if (da11 < 1. && db > 1.) {
if (db > bignum * da11) {
scaloc = 1. / db;
}
}
z__3.r = scaloc, z__3.i = 0.;
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
z__3.i + vec.i * z__3.r;
z__1 = zladiv_(&z__2, &a11);
x11.r = z__1.r, x11.i = z__1.i;
if (scaloc != 1.) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L40: */
}
*scale *= scaloc;
}
i__3 = k + l * c_dim1;
c__[i__3].r = x11.r, c__[i__3].i = x11.i;
/* L50: */
}
/* L60: */
}
} else if (! notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
i__1 = *m;
for (k = 1; k <= i__1; ++k) {
i__2 = k - 1;
z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
c_dim1 + 1], &c__1);
suml.r = z__1.r, suml.i = z__1.i;
i__2 = *n - l;
/* Computing MIN */
i__3 = l + 1;
/* Computing MIN */
i__4 = l + 1;
z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
l + min(i__4,*n) * b_dim1], ldb);
sumr.r = z__1.r, sumr.i = z__1.i;
i__2 = k + l * c_dim1;
d_cnjg(&z__4, &sumr);
z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i;
vec.r = z__1.r, vec.i = z__1.i;
scaloc = 1.;
i__2 = k + k * a_dim1;
i__3 = l + l * b_dim1;
z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i;
z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i;
d_cnjg(&z__1, &z__2);
a11.r = z__1.r, a11.i = z__1.i;
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
d__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.;
da11 = smin;
*info = 1;
}
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
d__2));
if (da11 < 1. && db > 1.) {
if (db > bignum * da11) {
scaloc = 1. / db;
}
}
z__3.r = scaloc, z__3.i = 0.;
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
z__3.i + vec.i * z__3.r;
z__1 = zladiv_(&z__2, &a11);
x11.r = z__1.r, x11.i = z__1.i;
if (scaloc != 1.) {
i__2 = *n;
for (j = 1; j <= i__2; ++j) {
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L70: */
}
*scale *= scaloc;
}
i__2 = k + l * c_dim1;
c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L80: */
}
/* L90: */
}
} else if (notrna && ! notrnb) {
for (l = *n; l >= 1; --l) {
for (k = *m; k >= 1; --k) {
i__1 = *m - k;
/* Computing MIN */
i__2 = k + 1;
/* Computing MIN */
i__3 = k + 1;
z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
min(i__3,*m) + l * c_dim1], &c__1);
suml.r = z__1.r, suml.i = z__1.i;
i__1 = *n - l;
/* Computing MIN */
i__2 = l + 1;
/* Computing MIN */
i__3 = l + 1;
z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
l + min(i__3,*n) * b_dim1], ldb);
sumr.r = z__1.r, sumr.i = z__1.i;
i__1 = k + l * c_dim1;
d_cnjg(&z__4, &sumr);
z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i;
vec.r = z__1.r, vec.i = z__1.i;
scaloc = 1.;
i__1 = k + k * a_dim1;
d_cnjg(&z__3, &b[l + l * b_dim1]);
z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i;
z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i;
a11.r = z__1.r, a11.i = z__1.i;
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
d__2));
if (da11 <= smin) {
a11.r = smin, a11.i = 0.;
da11 = smin;
*info = 1;
}
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
d__2));
if (da11 < 1. && db > 1.) {
if (db > bignum * da11) {
scaloc = 1. / db;
}
}
z__3.r = scaloc, z__3.i = 0.;
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
z__3.i + vec.i * z__3.r;
z__1 = zladiv_(&z__2, &a11);
x11.r = z__1.r, x11.i = z__1.i;
if (scaloc != 1.) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L100: */
}
*scale *= scaloc;
}
i__1 = k + l * c_dim1;
c__[i__1].r = x11.r, c__[i__1].i = x11.i;
/* L110: */
}
/* L120: */
}
}
return;
}

107
relapack/src/ztrtri.c Normal file

@ -0,0 +1,107 @@
#include "relapack.h"
static void RELAPACK_ztrtri_rec(const char *, const char *, const int *,
double *, const int *, int *);
/** ZTRTRI computes the inverse of a complex upper or lower triangular matrix A.
*
* This routine is functionally equivalent to LAPACK's ztrtri.
* For details on its interface, see
* http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html
* */
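/* Call sketch (illustrative only; names and sizes are examples):
 *
 *   int n = 4, info;
 *   double A[2 * 4 * 4];              // column-major, interleaved re/im
 *   // (fill the lower triangle of A with a nonsingular triangular matrix)
 *   RELAPACK_ztrtri("L", "N", &n, A, &n, &info);
 *   // on return the lower triangle of A holds inv(A); info != 0 on error
 */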
void RELAPACK_ztrtri(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
) {
// Check arguments
const int lower = LAPACK(lsame)(uplo, "L");
const int upper = LAPACK(lsame)(uplo, "U");
const int nounit = LAPACK(lsame)(diag, "N");
const int unit = LAPACK(lsame)(diag, "U");
*info = 0;
if (!lower && !upper)
*info = -1;
else if (!nounit && !unit)
*info = -2;
else if (*n < 0)
*info = -3;
else if (*ldA < MAX(1, *n))
*info = -5;
if (*info) {
const int minfo = -*info;
LAPACK(xerbla)("ZTRTRI", &minfo);
return;
}
// Clean char * arguments
const char cleanuplo = lower ? 'L' : 'U';
const char cleandiag = nounit ? 'N' : 'U';
// check for singularity
if (nounit) {
int i;
for (i = 0; i < *n; i++)
if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
*info = i + 1;
return;
}
}
// Recursive kernel
RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
}
/** ztrtri's recursive compute kernel */
static void RELAPACK_ztrtri_rec(
const char *uplo, const char *diag, const int *n,
double *A, const int *ldA,
int *info
){
if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) {
// Unblocked
LAPACK(ztrti2)(uplo, diag, n, A, ldA, info);
return;
}
// Constants
const double ONE[] = { 1., 0. };
const double MONE[] = { -1., 0. };
// Splitting
const int n1 = ZREC_SPLIT(*n);
const int n2 = *n - n1;
// A_TL A_TR
// A_BL A_BR
double *const A_TL = A;
double *const A_TR = A + 2 * *ldA * n1;
double *const A_BL = A + 2 * n1;
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
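// Block-inverse identity used here (lower-triangular case): if
//   A = [ A_TL  0 ; A_BL  A_BR ],
// then
//   inv(A) = [ inv(A_TL)  0 ; -inv(A_BR)*A_BL*inv(A_TL)  inv(A_BR) ].
// After the recursive call inverts A_TL in place, ztrmm forms
// -A_BL*inv(A_TL) and ztrsm then applies inv(A_BR) from the left (solving
// with the not-yet-inverted A_BR), so A_BL ends up as the (2,1) block of
// inv(A); the final recursive call inverts A_BR. The upper case is analogous.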
// recursion(A_TL)
RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
if (*info)
return;
if (*uplo == 'L') {
// A_BL = - A_BL * A_TL
BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
// A_BL = A_BR \ A_BL
BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
} else {
// A_TR = - A_TL * A_TR
BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
// A_TR = A_TR / A_BR
BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
}
// recursion(A_BR)
RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
if (*info)
*info += n1;
}