Merge branch 'xianyi:develop' into issue4128

2023-07-14 13:33:52 +02:00 · 2023-07-14 13:33:52 +02:00 · a6e41900fd
parent 0f015d63f2 49077e7bde
commit a6e41900fd
41 changed files with 2143 additions and 141 deletions
--- a/Makefile.arm64
+++ b/Makefile.arm64
@ -69,7 +69,7 @@ endif
 # in GCC>=9
 ifeq ($(CORE), NEOVERSEN1)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
@ -92,9 +92,14 @@ endif
 # in GCC>=10.4
 ifeq ($(CORE), NEOVERSEV1)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ10), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
-ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
-CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
+CCOMMON_OPT += -march=armv8.4-a+sve
 ifeq (1, $(ISCLANG))
 CCOMMON_OPT += -mtune=cortex-x1
 else
 CCOMMON_OPT += -mtune=neoverse-v1
 endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
 endif
@ -122,8 +127,8 @@ endif
 # in GCC>=10.4
 ifeq ($(CORE), NEOVERSEN2)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ10), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
-ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
 ifneq ($(OSNAME), Darwin)
 CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 else
@ -155,7 +160,7 @@ endif
 # Use a53 tunings because a55 is only available in GCC>=8.1
 ifeq ($(CORE), CORTEXA55)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ8), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
@ -196,8 +201,13 @@ endif
 endif
 ifeq ($(CORE), THUNDERX3T110)
-ifeq ($(GCCVERSIONGTEQ10), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
-CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
+CCOMMON_OPT += -march=armv8.3-a 
 ifeq (0, $(ISCLANG))
 CCOMMON_OPT += -mtune=thunderx3t110
 else
 CCOMMON_OPT += -mtune=thunderx2t99
 endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
 endif
@ -225,9 +235,12 @@ endif
 endif
 endif
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 ifeq ($(CORE), EMAG8180)
-CCOMMON_OPT += -march=armv8-a -mtune=emag
+CCOMMON_OPT += -march=armv8-a
 ifeq  ($(ISCLANG), 0)
 CCOMMON_OPT += -mtune=emag
 endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=emag
 endif
--- a/README.md
+++ b/README.md
@ -9,9 +9,12 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n
 Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
 <!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->
 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
 OSUOSL POWERCI [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
 OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
 ## Introduction
 OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -271,6 +271,19 @@ jobs:
  - script: |
     make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
 - job: OSX_xbuild_DYNAMIC_ARM64
  pool:
     vmImage: 'macOS-11'
  variables:
     CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
     CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
  steps:
  - script: |
     ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
     /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
     /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
     make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
 - job: ALPINE_MUSL
  pool:
     vmImage: 'ubuntu-latest'
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
  if (ARM64)
 	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
    if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
-          set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
+          set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2)
    endif ()
    if (DYNAMIC_LIST)
 	  set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@ -187,7 +187,7 @@ set(CLASRC
   cposv.f  cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
   cppcon.f cppequ.f cpprfs.f cppsv.f  cppsvx.f cpptrf.f cpptri.f cpptrs.f
   cptcon.f cpteqr.f cptrfs.f cptsv.f  cptsvx.f cpttrf.f cpttrs.f cptts2.f
-   crot.f   cspcon.f csprfs.f cspsv.f
+   crot.f crscl.f cspcon.f csprfs.f cspsv.f
   cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
   cstegr.f cstein.f csteqr.f csycon.f
   csyrfs.f csysv.f  csysvx.f csytf2.f csytrf.f csytri.f
@ -381,7 +381,7 @@ set(ZLASRC
   zposv.f  zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
   zppcon.f zppequ.f zpprfs.f zppsv.f  zppsvx.f zpptrf.f zpptri.f zpptrs.f
   zptcon.f zpteqr.f zptrfs.f zptsv.f  zptsvx.f zpttrf.f zpttrs.f zptts2.f
-   zrot.f   zspcon.f zsprfs.f zspsv.f
+   zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
   zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
   zstegr.f zstein.f zsteqr.f zsycon.f
   zsyrfs.f zsysv.f  zsysvx.f zsytf2.f zsytrf.f zsytri.f
@ -686,7 +686,7 @@ set(CLASRC
   cposv.c  cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
   cppcon.c cppequ.c cpprfs.c cppsv.c  cppsvx.c cpptrf.c cpptri.c cpptrs.c
   cptcon.c cpteqr.c cptrfs.c cptsv.c  cptsvx.c cpttrf.c cpttrs.c cptts2.c
-   crot.c   cspcon.c csprfs.c cspsv.c
+   crot.c crscl.c  cspcon.c csprfs.c cspsv.c
   cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
   cstegr.c cstein.c csteqr.c csycon.c
   csyrfs.c csysv.c  csysvx.c csytf2.c csytrf.c csytri.c
@ -878,7 +878,7 @@ set(ZLASRC
   zposv.c  zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
   zppcon.c zppequ.c zpprfs.c zppsv.c  zppsvx.c zpptrf.c zpptri.c zpptrs.c
   zptcon.c zpteqr.c zptrfs.c zptsv.c  zptsvx.c zpttrf.c zpttrs.c zptts2.c
-   zrot.c   zspcon.c zsprfs.c zspsv.c
+   zrot.c zrscl.c  zspcon.c zsprfs.c zspsv.c
   zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
   zstegr.c zstein.c zsteqr.c zsycon.c
   zsyrfs.c zsysv.c  zsysvx.c zsytf2.c zsytrf.c zsytri.c
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@ -280,7 +280,29 @@ if (DEFINED TARGET)
  if (${TARGET} STREQUAL POWER8)
    set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
  endif()
 if (${TARGET} STREQUAL NEOVERSEV1)
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
    else ()
 	    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
    endif()
  endif()
  if (${TARGET} STREQUAL NEOVERSEN2)
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
    else ()
 	    message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.")
    endif()
  endif()
  if (${TARGET} STREQUAL ARMV8SVE)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
  endif()
 endif()
 if (DEFINED BINARY)
  message(STATUS "Compiling a ${BINARY}-bit binary.")
 endif ()
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){
  return x / y;
 }
 #ifndef NO_AFFINITY
 static inline int WhereAmI(void){
  int ret = 0, counter = 0;
  __asm__ volatile (
    "rdtimel.w  %[counter],   %[id]"
    : [id]"=r"(ret), [counter]"=r"(counter)
    :
    : "memory"
  );
  return ret;
 }
 #endif
 #ifdef DOUBLE
 #define GET_IMAGE(res)  __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res)  : : "memory")
 #else
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < *rows ) info = 8;
-        	if ( trans == BlasTrans    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasTrans    &&  *ldb < *cols ) info = 8;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < *cols ) info = 8;
-        	if ( trans == BlasTrans    &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasTrans    &&  *ldb < *rows ) info = 8;
 	}
 	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
--- a/kernel/arm64/dot_kernel_sve.c
+++ b/kernel/arm64/dot_kernel_sve.c
@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
        BLASLONG sve_width = SVE_WIDTH;
        for (BLASLONG i = 0; i < n; i += sve_width * 2) {
-                svbool_t pg_a = SVE_WHILELT(i, n);
+                svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
-                svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
+                svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
                SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
                SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
--- a/kernel/arm64/gemm_ncopy_sve_v1x8.c
+++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c
@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
-        svbool_t pg = SV_WHILE(0L, remaining_n);
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
--- a/kernel/arm64/gemm_tcopy_sve_v1x8.c
+++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c
@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
-        svbool_t pg = SV_WHILE(0L, remaining_n);
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
--- a/kernel/arm64/trmm_lncopy_sve_v1.c
+++ b/kernel/arm64/trmm_lncopy_sve_v1.c
@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    FLOAT *ao;
 #ifdef DOUBLE
    svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
    int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
    svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
    int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
    do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
        posY += n_active;
        js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
        n_active = svcntp_b64(svptrue_b64(), pn);
    } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
        n_active = svcntp_b32(svptrue_b32(), pn);
    } while (svptest_any(svptrue_b32(), pn));
 #endif
--- a/kernel/arm64/trmm_ltcopy_sve_v1.c
+++ b/kernel/arm64/trmm_ltcopy_sve_v1.c
@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    FLOAT *ao;
    js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
    int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
    int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
    do
@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
        posY += n_active;
        js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
        n_active = svcntp_b64(svptrue_b64(), pn);
    } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
        n_active = svcntp_b32(svptrue_b32(), pn);
    } while (svptest_any(svptrue_b32(), pn));
 #endif
--- a/kernel/arm64/trmm_uncopy_sve_v1.c
+++ b/kernel/arm64/trmm_uncopy_sve_v1.c
@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    FLOAT *ao;
 #ifdef DOUBLE
    svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
    int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
    svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
    int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
    do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
        posY += n_active;
        js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
        n_active = svcntp_b64(svptrue_b64(), pn);
    } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
        n_active = svcntp_b32(svptrue_b32(), pn);
    } while (svptest_any(svptrue_b32(), pn));
 #endif
--- a/kernel/arm64/trmm_utcopy_sve_v1.c
+++ b/kernel/arm64/trmm_utcopy_sve_v1.c
@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    FLOAT *ao;
    js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
    int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
    int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
    do
@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
        posY += n_active;
        js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
        n_active = svcntp_b64(svptrue_b64(), pn);
    } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
        n_active = svcntp_b32(svptrue_b32(), pn);
    } while (svptest_any(svptrue_b32(), pn));
 #endif
--- a/kernel/arm64/trsm_lncopy_sve.c
+++ b/kernel/arm64/trsm_lncopy_sve.c
@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
  int64_t js = 0;
  svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
  int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
  int32_t N = n;
  int32_t js = 0;
  svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
  int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
  do {
@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
    js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
    n_active = svcntp_b32(svptrue_b32(), pn);
  } while (svptest_any(svptrue_b32(), pn));
 #endif
--- a/kernel/generic/ztrsm_utcopy_1.c
+++ b/kernel/generic/ztrsm_utcopy_1.c
@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
  BLASLONG i, ii, j, jj;
-  FLOAT data01, data02;
+  FLOAT data01=0.0, data02=0.0;
  FLOAT *a1;
  lda *= 2;
--- a/kernel/generic/ztrsm_utcopy_2.c
+++ b/kernel/generic/ztrsm_utcopy_2.c
@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
  FLOAT data05, data06, data07, data08;
  FLOAT *a1, *a2;
  data01=data02=data07=data08=0.0;
  lda *= 2;
  jj = offset;
--- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c
@ -58,7 +58,7 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
        nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
        ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
        if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
            LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
--- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c
@ -60,7 +60,7 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans,
        nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
        ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
        ldc_t = MAX(1,m);
        ldt_t = MAX(1,k);
--- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c
@ -57,7 +57,7 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
        nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
        ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
        if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
            LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );
--- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c
@ -59,7 +59,7 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans,
        nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
        ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
        ldc_t = MAX(1,m);
        ldt_t = MAX(1,k);
--- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c
@ -57,7 +57,7 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
        nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
        ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
        if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
            LAPACKE_xerbla( "LAPACKE_slarfb", -8 );
--- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c
@ -59,7 +59,7 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans,
        nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
        ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
        ldc_t = MAX(1,m);
        ldt_t = MAX(1,k);
--- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c
@ -58,7 +58,7 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
        nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
        ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
        if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
            LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );
--- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c
@ -60,7 +60,7 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans,
        nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
        ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
        ldc_t = MAX(1,m);
        ldt_t = MAX(1,k);
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@ -280,7 +280,7 @@ CLASRC_O = \
   cposv.o  cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \
   cppcon.o cppequ.o cpprfs.o cppsv.o  cppsvx.o cpptrf.o cpptri.o cpptrs.o \
   cptcon.o cpteqr.o cptrfs.o cptsv.o  cptsvx.o cpttrf.o cpttrs.o cptts2.o \
-   crot.o   cspcon.o cspmv.o  cspr.o   csprfs.o cspsv.o \
+   crot.o crscl.o cspcon.o cspmv.o  cspr.o   csprfs.o cspsv.o \
   cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \
   cstegr.o cstein.o csteqr.o \
   csycon.o csymv.o \
@ -488,7 +488,7 @@ ZLASRC_O = \
   zposv.o  zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \
   zppcon.o zppequ.o zpprfs.o zppsv.o  zppsvx.o zpptrf.o zpptri.o zpptrs.o \
   zptcon.o zpteqr.o zptrfs.o zptsv.o  zptsvx.o zpttrf.o zpttrs.o zptts2.o \
-   zrot.o   zspcon.o zspmv.o  zspr.o   zsprfs.o zspsv.o \
+   zrot.o zrscl.o  zspcon.o zspmv.o  zspr.o   zsprfs.o zspsv.o \
   zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \
   zstegr.o zstein.o zsteqr.o \
   zsycon.o zsymv.o \
--- a/lapack-netlib/SRC/cgelss.f
+++ b/lapack-netlib/SRC/cgelss.f
@ -170,7 +170,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
      SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@ -214,8 +214,7 @@
 *     .. External Subroutines ..
      EXTERNAL           CBDSQR, CCOPY, CGEBRD, CGELQF, CGEMM, CGEMV,
     $                   CGEQRF, CLACPY, CLASCL, CLASET, CSRSCL, CUNGBR,
-     $                   CUNMBR, CUNMLQ, CUNMQR, SLABAD, SLASCL, SLASET,
+     $                   CUNMBR, CUNMLQ, CUNMQR, SLASCL, SLASET, XERBLA
     $                   XERBLA
 *     ..
 *     .. External Functions ..
      INTEGER            ILAENV
@ -388,7 +387,6 @@
      SFMIN = SLAMCH( 'S' )
      SMLNUM = SFMIN / EPS
      BIGNUM = ONE / SMLNUM
      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@ -540,7 +538,7 @@
     $                     LDB, CZERO, WORK, N )
               CALL CLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
   20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL CGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
            CALL CCOPY( N, WORK, 1, B, 1 )
         END IF
@ -645,7 +643,7 @@
               CALL CLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
     $                      LDB )
   40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL CGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ),
     $                  1, CZERO, WORK( IWORK ), 1 )
            CALL CCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@ -737,7 +735,7 @@
     $                     LDB, CZERO, WORK, N )
               CALL CLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
   60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL CGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
            CALL CCOPY( N, WORK, 1, B, 1 )
         END IF
--- a/lapack-netlib/SRC/cgetf2.f
+++ b/lapack-netlib/SRC/cgetf2.f
@ -101,7 +101,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup getf2
 *
 *  =====================================================================
      SUBROUTINE CGETF2( M, N, A, LDA, IPIV, INFO )
@ -126,16 +126,14 @@
     $                   ZERO = ( 0.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
-      REAL               SFMIN
+      INTEGER            J, JP
      INTEGER            I, J, JP
 *     ..
 *     .. External Functions ..
      REAL               SLAMCH
      INTEGER            ICAMAX
-      EXTERNAL           SLAMCH, ICAMAX
+      EXTERNAL           ICAMAX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGERU, CSCAL, CSWAP, XERBLA
+      EXTERNAL           CGERU, CRSCL, CSWAP, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          MAX, MIN
@ -161,10 +159,6 @@
 *
      IF( M.EQ.0 .OR. N.EQ.0 )
     $   RETURN
 *
 *     Compute machine safe minimum
 *
      SFMIN = SLAMCH('S')
 *
      DO 10 J = 1, MIN( M, N )
 *
@ -181,15 +175,8 @@
 *
 *           Compute elements J+1:M of J-th column.
 *
-            IF( J.LT.M ) THEN
+            IF( J.LT.M )
-               IF( ABS(A( J, J )) .GE. SFMIN ) THEN
+     $         CALL CRSCL( M-J, A( J, J ), A( J+1, J ), 1 )
                  CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 )
               ELSE
                  DO 20 I = 1, M-J
                     A( J+I, J ) = A( J+I, J ) / A( J, J )
   20             CONTINUE
               END IF
            END IF
 *
         ELSE IF( INFO.EQ.0 ) THEN
 *
--- a/lapack-netlib/SRC/crscl.c
+++ b/lapack-netlib/SRC/crscl.c
@ -0,0 +1,735 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <complex.h>
 #ifdef complex
 #undef complex
 #endif
 #ifdef I
 #undef I
 #endif
 #if defined(_WIN64)
 typedef long long BLASLONG;
 typedef unsigned long long BLASULONG;
 #else
 typedef long BLASLONG;
 typedef unsigned long BLASULONG;
 #endif
 #ifdef LAPACK_ILP64
 typedef BLASLONG blasint;
 #if defined(_WIN64)
 #define blasabs(x) llabs(x)
 #else
 #define blasabs(x) labs(x)
 #endif
 #else
 typedef int blasint;
 #define blasabs(x) abs(x)
 #endif
 typedef blasint integer;
 typedef unsigned int uinteger;
 typedef char *address;
 typedef short int shortint;
 typedef float real;
 typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
 #ifdef _MSC_VER
 static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
 static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
 static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
 static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
 #else
 static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
 static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
 static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
 static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
 #endif
 #define pCf(z) (*_pCf(z))
 #define pCd(z) (*_pCd(z))
 typedef int logical;
 typedef short int shortlogical;
 typedef char logical1;
 typedef char integer1;
 #define TRUE_ (1)
 #define FALSE_ (0)
 /* Extern is for use with -E */
 #ifndef Extern
 #define Extern extern
 #endif
 /* I/O stuff */
 typedef int flag;
 typedef int ftnlen;
 typedef int ftnint;
 /*external read, write*/
 typedef struct
 {	flag cierr;
 	ftnint ciunit;
 	flag ciend;
 	char *cifmt;
 	ftnint cirec;
 } cilist;
 /*internal read, write*/
 typedef struct
 {	flag icierr;
 	char *iciunit;
 	flag iciend;
 	char *icifmt;
 	ftnint icirlen;
 	ftnint icirnum;
 } icilist;
 /*open*/
 typedef struct
 {	flag oerr;
 	ftnint ounit;
 	char *ofnm;
 	ftnlen ofnmlen;
 	char *osta;
 	char *oacc;
 	char *ofm;
 	ftnint orl;
 	char *oblnk;
 } olist;
 /*close*/
 typedef struct
 {	flag cerr;
 	ftnint cunit;
 	char *csta;
 } cllist;
 /*rewind, backspace, endfile*/
 typedef struct
 {	flag aerr;
 	ftnint aunit;
 } alist;
 /* inquire */
 typedef struct
 {	flag inerr;
 	ftnint inunit;
 	char *infile;
 	ftnlen infilen;
 	ftnint	*inex;	/*parameters in standard's order*/
 	ftnint	*inopen;
 	ftnint	*innum;
 	ftnint	*innamed;
 	char	*inname;
 	ftnlen	innamlen;
 	char	*inacc;
 	ftnlen	inacclen;
 	char	*inseq;
 	ftnlen	inseqlen;
 	char 	*indir;
 	ftnlen	indirlen;
 	char	*infmt;
 	ftnlen	infmtlen;
 	char	*inform;
 	ftnint	informlen;
 	char	*inunf;
 	ftnlen	inunflen;
 	ftnint	*inrecl;
 	ftnint	*innrec;
 	char	*inblank;
 	ftnlen	inblanklen;
 } inlist;
 #define VOID void
 union Multitype {	/* for multiple entry points */
 	integer1 g;
 	shortint h;
 	integer i;
 	/* longint j; */
 	real r;
 	doublereal d;
 	complex c;
 	doublecomplex z;
 	};
 typedef union Multitype Multitype;
 struct Vardesc {	/* for Namelist */
 	char *name;
 	char *addr;
 	ftnlen *dims;
 	int  type;
 	};
 typedef struct Vardesc Vardesc;
 struct Namelist {
 	char *name;
 	Vardesc **vars;
 	int nvars;
 	};
 typedef struct Namelist Namelist;
 #define abs(x) ((x) >= 0 ? (x) : -(x))
 #define dabs(x) (fabs(x))
 #define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
 #define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
 #define dmin(a,b) (f2cmin(a,b))
 #define dmax(a,b) (f2cmax(a,b))
 #define bit_test(a,b)	((a) >> (b) & 1)
 #define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
 #define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
 #define abort_() { sig_die("Fortran abort routine called", 1); }
 #define c_abs(z) (cabsf(Cf(z)))
 #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
 #ifdef _MSC_VER
 #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
 #define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
 #else
 #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
 #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
 #endif
 #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
 #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
 #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
 //#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
 #define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
 #define d_abs(x) (fabs(*(x)))
 #define d_acos(x) (acos(*(x)))
 #define d_asin(x) (asin(*(x)))
 #define d_atan(x) (atan(*(x)))
 #define d_atn2(x, y) (atan2(*(x),*(y)))
 #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
 #define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
 #define d_cos(x) (cos(*(x)))
 #define d_cosh(x) (cosh(*(x)))
 #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
 #define d_exp(x) (exp(*(x)))
 #define d_imag(z) (cimag(Cd(z)))
 #define r_imag(z) (cimagf(Cf(z)))
 #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
 #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
 #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
 #define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
 #define d_log(x) (log(*(x)))
 #define d_mod(x, y) (fmod(*(x), *(y)))
 #define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
 #define d_nint(x) u_nint(*(x))
 #define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
 #define d_sign(a,b) u_sign(*(a),*(b))
 #define r_sign(a,b) u_sign(*(a),*(b))
 #define d_sin(x) (sin(*(x)))
 #define d_sinh(x) (sinh(*(x)))
 #define d_sqrt(x) (sqrt(*(x)))
 #define d_tan(x) (tan(*(x)))
 #define d_tanh(x) (tanh(*(x)))
 #define i_abs(x) abs(*(x))
 #define i_dnnt(x) ((integer)u_nint(*(x)))
 #define i_len(s, n) (n)
 #define i_nint(x) ((integer)u_nint(*(x)))
 #define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
 #define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
 #define pow_si(B,E) spow_ui(*(B),*(E))
 #define pow_ri(B,E) spow_ui(*(B),*(E))
 #define pow_di(B,E) dpow_ui(*(B),*(E))
 #define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
 #define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
 #define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
 #define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
 #define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
 static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
 #define myexit_() break;
 #define mycycle_() continue;
 #define myceiling_(w) {ceil(w)}
 #define myhuge_(w) {HUGE_VAL}
 //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
 #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
 /* procedure parameter types for -A and -C++ */
 #define F2C_proc_par_types 1
 #ifdef __cplusplus
 typedef logical (*L_fp)(...);
 #else
 typedef logical (*L_fp)();
 #endif
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x = 1/x;
 		for(u = n; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x = 1/x;
 		for(u = n; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 #ifdef _MSC_VER
 static _Fcomplex cpow_ui(complex x, integer n) {
 	complex pow={1.0,0.0}; unsigned long int u;
 		if(n != 0) {
 		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
 		for(u = n; ; ) {
 			if(u & 01) pow.r *= x.r, pow.i *= x.i;
 			if(u >>= 1) x.r *= x.r, x.i *= x.i;
 			else break;
 		}
 	}
 	_Fcomplex p={pow.r, pow.i};
 	return p;
 }
 #else
 static _Complex float cpow_ui(_Complex float x, integer n) {
 	_Complex float pow=1.0; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x = 1/x;
 		for(u = n; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 #endif
 #ifdef _MSC_VER
 static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
 	_Dcomplex pow={1.0,0.0}; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
 		for(u = n; ; ) {
 			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
 			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
 			else break;
 		}
 	}
 	_Dcomplex p = {pow._Val[0], pow._Val[1]};
 	return p;
 }
 #else
 static _Complex double zpow_ui(_Complex double x, integer n) {
 	_Complex double pow=1.0; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x = 1/x;
 		for(u = n; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 #endif
 static integer pow_ii(integer x, integer n) {
 	integer pow; unsigned long int u;
 	if (n <= 0) {
 		if (n == 0 || x == 1) pow = 1;
 		else if (x != -1) pow = x == 0 ? 1/x : 0;
 		else n = -n;
 	}
 	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
 		u = n;
 		for(pow = 1; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 static integer dmaxloc_(double *w, integer s, integer e, integer *n)
 {
 	double m; integer i, mi;
 	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
 		if (w[i-1]>m) mi=i ,m=w[i-1];
 	return mi-s+1;
 }
 static integer smaxloc_(float *w, integer s, integer e, integer *n)
 {
 	float m; integer i, mi;
 	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
 		if (w[i-1]>m) mi=i ,m=w[i-1];
 	return mi-s+1;
 }
 static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
 	integer n = *n_, incx = *incx_, incy = *incy_, i;
 #ifdef _MSC_VER
 	_Fcomplex zdotc = {0.0, 0.0};
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
 			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
 			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
 		}
 	}
 	pCf(z) = zdotc;
 }
 #else
 	_Complex float zdotc = 0.0;
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
 		}
 	}
 	pCf(z) = zdotc;
 }
 #endif
 static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
 	integer n = *n_, incx = *incx_, incy = *incy_, i;
 #ifdef _MSC_VER
 	_Dcomplex zdotc = {0.0, 0.0};
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
 			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
 			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
 		}
 	}
 	pCd(z) = zdotc;
 }
 #else
 	_Complex double zdotc = 0.0;
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
 		}
 	}
 	pCd(z) = zdotc;
 }
 #endif	
 static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
 	integer n = *n_, incx = *incx_, incy = *incy_, i;
 #ifdef _MSC_VER
 	_Fcomplex zdotc = {0.0, 0.0};
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
 			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
 			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
 		}
 	}
 	pCf(z) = zdotc;
 }
 #else
 	_Complex float zdotc = 0.0;
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += Cf(&x[i]) * Cf(&y[i]);
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
 		}
 	}
 	pCf(z) = zdotc;
 }
 #endif
 static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
 	integer n = *n_, incx = *incx_, incy = *incy_, i;
 #ifdef _MSC_VER
 	_Dcomplex zdotc = {0.0, 0.0};
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
 			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
 			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
 		}
 	}
 	pCd(z) = zdotc;
 }
 #else
 	_Complex double zdotc = 0.0;
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += Cd(&x[i]) * Cd(&y[i]);
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
 		}
 	}
 	pCd(z) = zdotc;
 }
 #endif
 /*  -- translated by f2c (version 20000121).
   You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 /*  -- translated by f2c (version 20000121).
   You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 /* > \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. */
 /*  =========== DOCUMENTATION =========== */
 /* Online html documentation available at */
 /*            http://www.netlib.org/lapack/explore-html/ */
 /* > \htmlonly */
 /* > Download CRSCL + dependencies */
 /* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crscl.f
 "> */
 /* > [TGZ]</a> */
 /* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crscl.f
 "> */
 /* > [ZIP]</a> */
 /* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crscl.f
 "> */
 /* > [TXT]</a> */
 /* > \endhtmlonly */
 /*  Definition: */
 /*  =========== */
 /*       SUBROUTINE CRSCL( N, A, X, INCX ) */
 /*       INTEGER            INCX, N */
 /*       COMPLEX            A */
 /*       COMPLEX            X( * ) */
 /* > \par Purpose: */
 /*  ============= */
 /* > */
 /* > \verbatim */
 /* > */
 /* > CRSCL multiplies an n-element complex vector x by the complex scalar */
 /* > 1/a.  This is done without overflow or underflow as long as */
 /* > the final result x/a does not overflow or underflow. */
 /* > \endverbatim */
 /*  Arguments: */
 /*  ========== */
 /* > \param[in] N */
 /* > \verbatim */
 /* >          N is INTEGER */
 /* >          The number of components of the vector x. */
 /* > \endverbatim */
 /* > */
 /* > \param[in] A */
 /* > \verbatim */
 /* >          A is COMPLEX */
 /* >          The scalar a which is used to divide each component of x. */
 /* >          A must not be 0, or the subroutine will divide by zero. */
 /* > \endverbatim */
 /* > */
 /* > \param[in,out] X */
 /* > \verbatim */
 /* >          X is COMPLEX array, dimension */
 /* >                         (1+(N-1)*abs(INCX)) */
 /* >          The n-element vector x. */
 /* > \endverbatim */
 /* > */
 /* > \param[in] INCX */
 /* > \verbatim */
 /* >          INCX is INTEGER */
 /* >          The increment between successive values of the vector X. */
 /* >          > 0:  X(1) = X(1) and X(1+(i-1)*INCX) = x(i),     1< i<= n */
 /* > \endverbatim */
 /*  Authors: */
 /*  ======== */
 /* > \author Univ. of Tennessee */
 /* > \author Univ. of California Berkeley */
 /* > \author Univ. of Colorado Denver */
 /* > \author NAG Ltd. */
 /* > \ingroup complexOTHERauxiliary */
 /*  ===================================================================== */
 /* Subroutine */ int crscl_(integer *n, complex *a, complex *x, integer *incx)
 {
    /* System generated locals */
    real r__1, r__2;
    complex q__1;
    /* Local variables */
    real absi, absr;
    extern /* Subroutine */ int cscal_(integer *, complex *, complex *, 
 	    integer *);
    real ai, ar, ui, ov, ur;
    extern real slamch_(char *);
    extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer 
 	    *);
    real safmin, safmax;
    extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer 
 	    *);
 /*  -- LAPACK auxiliary routine -- */
 /*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
 /*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
 /* ===================================================================== */
 /*     Quick return if possible */
    /* Parameter adjustments */
    --x;
    /* Function Body */
    if (*n <= 0) {
 	return 0;
    }
 /*     Get machine parameters */
    safmin = slamch_("S");
    safmax = 1.f / safmin;
    ov = slamch_("O");
 /*     Initialize constants related to A. */
    ar = a->r;
    ai = r_imag(a);
    absr = abs(ar);
    absi = abs(ai);
    if (ai == 0.f) {
 /*        If alpha is real, then we can use csrscl */
 	csrscl_(n, &ar, &x[1], incx);
    } else if (ar == 0.f) {
 /*        If alpha has a zero real part, then we follow the same rules as if */
 /*        alpha were real. */
 	if (absi > safmax) {
 	    csscal_(n, &safmin, &x[1], incx);
 	    r__1 = -safmax / ai;
 	    q__1.r = 0.f, q__1.i = r__1;
 	    cscal_(n, &q__1, &x[1], incx);
 	} else if (absi < safmin) {
 	    r__1 = -safmin / ai;
 	    q__1.r = 0.f, q__1.i = r__1;
 	    cscal_(n, &q__1, &x[1], incx);
 	    csscal_(n, &safmax, &x[1], incx);
 	} else {
 	    r__1 = -1.f / ai;
 	    q__1.r = 0.f, q__1.i = r__1;
 	    cscal_(n, &q__1, &x[1], incx);
 	}
    } else {
 /*        The following numbers can be computed. */
 /*        They are the inverse of the real and imaginary parts of 1/alpha. */
 /*        Note that a and b are always different from zero. */
 /*        NaNs are only possible if either: */
 /*        1. alphaR or alphaI is NaN. */
 /*        2. alphaR and alphaI are both infinite, in which case it makes sense */
 /*        to propagate a NaN. */
 	ur = ar + ai * (ai / ar);
 	ui = ai + ar * (ar / ai);
 	if (abs(ur) < safmin || abs(ui) < safmin) {
 /*           This means that both alphaR and alphaI are very small. */
 	    r__1 = safmin / ur;
 	    r__2 = -safmin / ui;
 	    q__1.r = r__1, q__1.i = r__2;
 	    cscal_(n, &q__1, &x[1], incx);
 	    csscal_(n, &safmax, &x[1], incx);
 	} else if (abs(ur) > safmax || abs(ui) > safmax) {
 	    if (absr > ov || absi > ov) {
 /*              This means that a and b are both Inf. No need for scaling. */
 		r__1 = 1.f / ur;
 		r__2 = -1.f / ui;
 		q__1.r = r__1, q__1.i = r__2;
 		cscal_(n, &q__1, &x[1], incx);
 	    } else {
 		csscal_(n, &safmin, &x[1], incx);
 		if (abs(ur) > ov || abs(ui) > ov) {
 /*                 Infs were generated. We do proper scaling to avoid them. */
 		    if (absr >= absi) {
 /*                    ABS( UR ) <= ABS( UI ) */
 			ur = safmin * ar + safmin * (ai * (ai / ar));
 			ui = safmin * ai + ar * (safmin * ar / ai);
 		    } else {
 /*                    ABS( UR ) > ABS( UI ) */
 			ur = safmin * ar + ai * (safmin * ai / ar);
 			ui = safmin * ai + safmin * (ar * (ar / ai));
 		    }
 		    r__1 = 1.f / ur;
 		    r__2 = -1.f / ui;
 		    q__1.r = r__1, q__1.i = r__2;
 		    cscal_(n, &q__1, &x[1], incx);
 		} else {
 		    r__1 = safmax / ur;
 		    r__2 = -safmax / ui;
 		    q__1.r = r__1, q__1.i = r__2;
 		    cscal_(n, &q__1, &x[1], incx);
 		}
 	    }
 	} else {
 	    r__1 = 1.f / ur;
 	    r__2 = -1.f / ui;
 	    q__1.r = r__1, q__1.i = r__2;
 	    cscal_(n, &q__1, &x[1], incx);
 	}
    }
    return 0;
 /*     End of CRSCL */
 } /* crscl_ */
--- a/lapack-netlib/SRC/crscl.f
+++ b/lapack-netlib/SRC/crscl.f
@ -0,0 +1,202 @@
 *> \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar.
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download CRSCL + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crscl.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crscl.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crscl.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE CRSCL( N, A, X, INCX )
 *
 *       .. Scalar Arguments ..
 *       INTEGER            INCX, N
 *       COMPLEX            A
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX            X( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CRSCL multiplies an n-element complex vector x by the complex scalar
 *> 1/a.  This is done without overflow or underflow as long as
 *> the final result x/a does not overflow or underflow.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of components of the vector x.
 *> \endverbatim
 *>
 *> \param[in] A
 *> \verbatim
 *>          A is COMPLEX
 *>          The scalar a which is used to divide each component of x.
 *>          A must not be 0, or the subroutine will divide by zero.
 *> \endverbatim
 *>
 *> \param[in,out] X
 *> \verbatim
 *>          X is COMPLEX array, dimension
 *>                         (1+(N-1)*abs(INCX))
 *>          The n-element vector x.
 *> \endverbatim
 *>
 *> \param[in] INCX
 *> \verbatim
 *>          INCX is INTEGER
 *>          The increment between successive values of the vector X.
 *>          > 0:  X(1) = X(1) and X(1+(i-1)*INCX) = x(i),     1< i<= n
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complexOTHERauxiliary
 *
 *  =====================================================================
      SUBROUTINE CRSCL( N, A, X, INCX )
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER            INCX, N
      COMPLEX            A
 *     ..
 *     .. Array Arguments ..
      COMPLEX            X( * )
 *     ..
 *
 * =====================================================================
 *
 *     .. Parameters ..
      REAL               ZERO, ONE
      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
 *     ..
 *     .. Local Scalars ..
      REAL               SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR
     %                   , UI
 *     ..
 *     .. External Functions ..
      REAL               SLAMCH
      COMPLEX            CLADIV
      EXTERNAL           SLAMCH, CLADIV
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CSCAL, CSSCAL, CSRSCL
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          ABS
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( N.LE.0 )
     $   RETURN
 *
 *     Get machine parameters
 *
      SAFMIN = SLAMCH( 'S' )
      SAFMAX = ONE / SAFMIN
      OV   = SLAMCH( 'O' )
 *
 *     Initialize constants related to A.
 *
      AR = REAL( A )
      AI = AIMAG( A )
      ABSR = ABS( AR )
      ABSI = ABS( AI )
 *
      IF( AI.EQ.ZERO ) THEN
 *        If alpha is real, then we can use csrscl
         CALL CSRSCL( N, AR, X, INCX )
 *
      ELSE IF( AR.EQ.ZERO ) THEN
 *        If alpha has a zero real part, then we follow the same rules as if
 *        alpha were real.
         IF( ABSI.GT.SAFMAX ) THEN
            CALL CSSCAL( N, SAFMIN, X, INCX )
            CALL CSCAL( N, CMPLX( ZERO, -SAFMAX / AI ), X, INCX )
         ELSE IF( ABSI.LT.SAFMIN ) THEN
            CALL CSCAL( N, CMPLX( ZERO, -SAFMIN / AI ), X, INCX )
            CALL CSSCAL( N, SAFMAX, X, INCX )
         ELSE
            CALL CSCAL( N, CMPLX( ZERO, -ONE / AI ), X, INCX )
         END IF
 *
      ELSE
 *        The following numbers can be computed.
 *        They are the inverse of the real and imaginary parts of 1/alpha.
 *        Note that a and b are always different from zero.
 *        NaNs are only possible if either:
 *        1. alphaR or alphaI is NaN.
 *        2. alphaR and alphaI are both infinite, in which case it makes sense
 *        to propagate a NaN.
         UR = AR + AI * ( AI / AR )
         UI = AI + AR * ( AR / AI )
 *
         IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN
 *           This means that both alphaR and alphaI are very small.
            CALL CSCAL( N, CMPLX( SAFMIN / UR, -SAFMIN / UI ), X, INCX )
            CALL CSSCAL( N, SAFMAX, X, INCX )
         ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN
            IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN
 *              This means that a and b are both Inf. No need for scaling.
               CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
            ELSE
               CALL CSSCAL( N, SAFMIN, X, INCX )
               IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN
 *                 Infs were generated. We do proper scaling to avoid them.
                  IF( ABSR.GE.ABSI ) THEN
 *                    ABS( UR ) <= ABS( UI )
                     UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR ))
                     UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI )
                  ELSE
 *                    ABS( UR ) > ABS( UI )
                     UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR )
                     UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI ))
                  END IF
                  CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
               ELSE
                  CALL CSCAL( N, CMPLX( SAFMAX / UR, -SAFMAX / UI ),
     $                        X, INCX )
               END IF
            END IF
         ELSE
            CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
         END IF
      END IF
 *
      RETURN
 *
 *     End of CRSCL
 *
      END
--- a/lapack-netlib/SRC/cstemr.f
+++ b/lapack-netlib/SRC/cstemr.f
@ -320,7 +320,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@ -329,7 +329,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
 *> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
      SUBROUTINE CSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@ -361,7 +362,8 @@
     $                     MINRGP = 3.0E-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
     $                   LAESWAP
      INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
     $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
     $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@ -397,6 +399,7 @@
 *
      LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
      ZQUERY = ( NZC.EQ.-1 )
      LAESWAP = .FALSE.
 *     SSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N.
@ -519,6 +522,15 @@
         ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
            CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
         END IF
 *        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
 *        the following code requires R1 >= R2. Hence, we correct
 *        the order of R1, R2, CS, SN if R1 < R2 before further processing.
         IF( R1.LT.R2 ) THEN
            E(2) = R1
            R1 = R2
            R2 = E(2)
            LAESWAP = .TRUE.
         ENDIF
         IF( ALLEIG.OR.
     $      (VALEIG.AND.(R2.GT.WL).AND.
     $                  (R2.LE.WU)).OR.
@ -526,8 +538,13 @@
            M = M+1
            W( M ) = R2
            IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
+               IF( LAESWAP ) THEN
-               Z( 2, M ) = CS
+                  Z( 1, M ) = CS
                  Z( 2, M ) = SN
               ELSE
                  Z( 1, M ) = -SN
                  Z( 2, M ) = CS
               ENDIF
 *              Note: At most one of SN and CS can be zero.
               IF (SN.NE.ZERO) THEN
                  IF (CS.NE.ZERO) THEN
@ -550,8 +567,13 @@
            M = M+1
            W( M ) = R1
            IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
+               IF( LAESWAP ) THEN
-               Z( 2, M ) = SN
+                  Z( 1, M ) = -SN
                  Z( 2, M ) = CS
               ELSE
                  Z( 1, M ) = CS
                  Z( 2, M ) = SN
               ENDIF
 *              Note: At most one of SN and CS can be zero.
               IF (SN.NE.ZERO) THEN
                  IF (CS.NE.ZERO) THEN
--- a/lapack-netlib/SRC/dgelss.f
+++ b/lapack-netlib/SRC/dgelss.f
@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
      SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@ -203,7 +203,7 @@
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DBDSQR, DCOPY, DGEBRD, DGELQF, DGEMM, DGEMV,
-     $                   DGEQRF, DLABAD, DLACPY, DLASCL, DLASET, DORGBR,
+     $                   DGEQRF, DLACPY, DLASCL, DLASET, DORGBR,
     $                   DORMBR, DORMLQ, DORMQR, DRSCL, XERBLA
 *     ..
 *     .. External Functions ..
@ -385,7 +385,6 @@
      SFMIN = DLAMCH( 'S' )
      SMLNUM = SFMIN / EPS
      BIGNUM = ONE / SMLNUM
      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@ -529,7 +528,7 @@
     $                     LDB, ZERO, WORK, N )
               CALL DLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
   20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL DGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
            CALL DCOPY( N, WORK, 1, B, 1 )
         END IF
@ -626,7 +625,7 @@
               CALL DLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
     $                      LDB )
   40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL DGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ),
     $                  1, ZERO, WORK( IWORK ), 1 )
            CALL DCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@ -712,7 +711,7 @@
     $                     LDB, ZERO, WORK, N )
               CALL DLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
   60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL DGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
            CALL DCOPY( N, WORK, 1, B, 1 )
         END IF
--- a/lapack-netlib/SRC/dstemr.f
+++ b/lapack-netlib/SRC/dstemr.f
@ -303,7 +303,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@ -312,7 +312,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
 *> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
      SUBROUTINE DSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@ -344,7 +345,8 @@
     $                     MINRGP = 1.0D-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
     $                   LAESWAP
      INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
     $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
     $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@ -380,6 +382,7 @@
 *
      LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
      ZQUERY = ( NZC.EQ.-1 )
      LAESWAP = .FALSE.
 *     DSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N.
@ -502,6 +505,15 @@
         ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
            CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
         END IF
 *        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
 *        the following code requires R1 >= R2. Hence, we correct
 *        the order of R1, R2, CS, SN if R1 < R2 before further processing.
         IF( R1.LT.R2 ) THEN
            E(2) = R1
            R1 = R2
            R2 = E(2)
            LAESWAP = .TRUE.
         ENDIF
         IF( ALLEIG.OR.
     $      (VALEIG.AND.(R2.GT.WL).AND.
     $                  (R2.LE.WU)).OR.
@ -509,8 +521,13 @@
            M = M+1
            W( M ) = R2
            IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
+               IF( LAESWAP ) THEN
-               Z( 2, M ) = CS
+                  Z( 1, M ) = CS
                  Z( 2, M ) = SN
               ELSE
                  Z( 1, M ) = -SN
                  Z( 2, M ) = CS
               ENDIF
 *              Note: At most one of SN and CS can be zero.
               IF (SN.NE.ZERO) THEN
                  IF (CS.NE.ZERO) THEN
@ -533,8 +550,13 @@
            M = M+1
            W( M ) = R1
            IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
+               IF( LAESWAP ) THEN
-               Z( 2, M ) = SN
+                  Z( 1, M ) = -SN
                  Z( 2, M ) = CS
               ELSE
                  Z( 1, M ) = CS
                  Z( 2, M ) = SN
               ENDIF
 *              Note: At most one of SN and CS can be zero.
               IF (SN.NE.ZERO) THEN
                  IF (CS.NE.ZERO) THEN
--- a/lapack-netlib/SRC/sgelss.f
+++ b/lapack-netlib/SRC/sgelss.f
@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
      SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@ -202,7 +202,7 @@
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SBDSQR, SCOPY, SGEBRD, SGELQF, SGEMM, SGEMV,
-     $                   SGEQRF, SLABAD, SLACPY, SLASCL, SLASET, SORGBR,
+     $                   SGEQRF, SLACPY, SLASCL, SLASET, SORGBR,
     $                   SORMBR, SORMLQ, SORMQR, SRSCL, XERBLA
 *     ..
 *     .. External Functions ..
@ -381,7 +381,6 @@
      SFMIN = SLAMCH( 'S' )
      SMLNUM = SFMIN / EPS
      BIGNUM = ONE / SMLNUM
      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@ -525,7 +524,7 @@
     $                     LDB, ZERO, WORK, N )
               CALL SLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
   20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL SGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
            CALL SCOPY( N, WORK, 1, B, 1 )
         END IF
@ -622,7 +621,7 @@
               CALL SLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
     $                      LDB )
   40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL SGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ),
     $                  1, ZERO, WORK( IWORK ), 1 )
            CALL SCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@ -708,7 +707,7 @@
     $                     LDB, ZERO, WORK, N )
               CALL SLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
   60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL SGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
            CALL SCOPY( N, WORK, 1, B, 1 )
         END IF
--- a/lapack-netlib/SRC/sstemr.f
+++ b/lapack-netlib/SRC/sstemr.f
@ -303,7 +303,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@ -312,7 +312,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
 *> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
      SUBROUTINE SSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@ -344,7 +345,8 @@
     $                     MINRGP = 3.0E-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
     $                   LAESWAP
      INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
     $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
     $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@ -378,6 +380,7 @@
 *
      LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
      ZQUERY = ( NZC.EQ.-1 )
      LAESWAP = .FALSE.
 *     SSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N.
@ -500,6 +503,15 @@
         ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
            CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
         END IF
 *        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
 *        the following code requires R1 >= R2. Hence, we correct
 *        the order of R1, R2, CS, SN if R1 < R2 before further processing.
         IF( R1.LT.R2 ) THEN
            E(2) = R1
            R1 = R2
            R2 = E(2)
            LAESWAP = .TRUE.
         ENDIF
         IF( ALLEIG.OR.
     $      (VALEIG.AND.(R2.GT.WL).AND.
     $                  (R2.LE.WU)).OR.
@ -507,8 +519,13 @@
            M = M+1
            W( M ) = R2
            IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
+               IF( LAESWAP ) THEN
-               Z( 2, M ) = CS
+                  Z( 1, M ) = CS
                  Z( 2, M ) = SN
               ELSE
                  Z( 1, M ) = -SN
                  Z( 2, M ) = CS
               ENDIF
 *              Note: At most one of SN and CS can be zero.
               IF (SN.NE.ZERO) THEN
                  IF (CS.NE.ZERO) THEN
@ -531,8 +548,13 @@
            M = M+1
            W( M ) = R1
            IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
+               IF( LAESWAP ) THEN
-               Z( 2, M ) = SN
+                  Z( 1, M ) = -SN
                  Z( 2, M ) = CS
               ELSE
                  Z( 1, M ) = CS
                  Z( 2, M ) = SN
               ENDIF
 *              Note: At most one of SN and CS can be zero.
               IF (SN.NE.ZERO) THEN
                  IF (CS.NE.ZERO) THEN
--- a/lapack-netlib/SRC/zgelss.f
+++ b/lapack-netlib/SRC/zgelss.f
@ -170,7 +170,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
      SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@ -212,10 +212,9 @@
      COMPLEX*16         DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY,
+      EXTERNAL           DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, ZDRSCL,
-     $                   ZDRSCL, ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF,
+     $                   ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, ZLACPY,
-     $                   ZLACPY, ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ,
+     $                   ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ
     $                   ZUNMQR
 *     ..
 *     .. External Functions ..
      INTEGER            ILAENV
@ -388,7 +387,6 @@
      SFMIN = DLAMCH( 'S' )
      SMLNUM = SFMIN / EPS
      BIGNUM = ONE / SMLNUM
      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@ -540,7 +538,7 @@
     $                     LDB, CZERO, WORK, N )
               CALL ZLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
   20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL ZGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
            CALL ZCOPY( N, WORK, 1, B, 1 )
         END IF
@ -645,7 +643,7 @@
               CALL ZLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
     $                      LDB )
   40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL ZGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ),
     $                  1, CZERO, WORK( IWORK ), 1 )
            CALL ZCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@ -737,7 +735,7 @@
     $                     LDB, CZERO, WORK, N )
               CALL ZLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
   60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
            CALL ZGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
            CALL ZCOPY( N, WORK, 1, B, 1 )
         END IF
--- a/lapack-netlib/SRC/zgetf2.f
+++ b/lapack-netlib/SRC/zgetf2.f
@ -101,7 +101,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup getf2
 *
 *  =====================================================================
      SUBROUTINE ZGETF2( M, N, A, LDA, IPIV, INFO )
@ -127,7 +127,7 @@
 *     ..
 *     .. Local Scalars ..
      DOUBLE PRECISION   SFMIN
-      INTEGER            I, J, JP
+      INTEGER            J, JP
 *     ..
 *     .. External Functions ..
      DOUBLE PRECISION   DLAMCH
@ -135,7 +135,7 @@
      EXTERNAL           DLAMCH, IZAMAX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZGERU, ZSCAL, ZSWAP
+      EXTERNAL           XERBLA, ZGERU, ZRSCL, ZSWAP
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          MAX, MIN
@ -181,15 +181,8 @@
 *
 *           Compute elements J+1:M of J-th column.
 *
-            IF( J.LT.M ) THEN
+            IF( J.LT.M )
-               IF( ABS(A( J, J )) .GE. SFMIN ) THEN
+     $         CALL ZRSCL( M-J, A( J, J ), A( J+1, J ), 1 )
                  CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 )
               ELSE
                  DO 20 I = 1, M-J
                     A( J+I, J ) = A( J+I, J ) / A( J, J )
   20             CONTINUE
               END IF
            END IF
 *
         ELSE IF( INFO.EQ.0 ) THEN
 *
--- a/lapack-netlib/SRC/zrscl.c
+++ b/lapack-netlib/SRC/zrscl.c
@ -0,0 +1,735 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <complex.h>
 #ifdef complex
 #undef complex
 #endif
 #ifdef I
 #undef I
 #endif
 #if defined(_WIN64)
 typedef long long BLASLONG;
 typedef unsigned long long BLASULONG;
 #else
 typedef long BLASLONG;
 typedef unsigned long BLASULONG;
 #endif
 #ifdef LAPACK_ILP64
 typedef BLASLONG blasint;
 #if defined(_WIN64)
 #define blasabs(x) llabs(x)
 #else
 #define blasabs(x) labs(x)
 #endif
 #else
 typedef int blasint;
 #define blasabs(x) abs(x)
 #endif
 typedef blasint integer;
 typedef unsigned int uinteger;
 typedef char *address;
 typedef short int shortint;
 typedef float real;
 typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
 #ifdef _MSC_VER
 static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
 static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
 static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
 static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
 #else
 static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
 static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
 static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
 static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
 #endif
 #define pCf(z) (*_pCf(z))
 #define pCd(z) (*_pCd(z))
 typedef int logical;
 typedef short int shortlogical;
 typedef char logical1;
 typedef char integer1;
 #define TRUE_ (1)
 #define FALSE_ (0)
 /* Extern is for use with -E */
 #ifndef Extern
 #define Extern extern
 #endif
 /* I/O stuff */
 typedef int flag;
 typedef int ftnlen;
 typedef int ftnint;
 /*external read, write*/
 typedef struct
 {	flag cierr;
 	ftnint ciunit;
 	flag ciend;
 	char *cifmt;
 	ftnint cirec;
 } cilist;
 /*internal read, write*/
 typedef struct
 {	flag icierr;
 	char *iciunit;
 	flag iciend;
 	char *icifmt;
 	ftnint icirlen;
 	ftnint icirnum;
 } icilist;
 /*open*/
 typedef struct
 {	flag oerr;
 	ftnint ounit;
 	char *ofnm;
 	ftnlen ofnmlen;
 	char *osta;
 	char *oacc;
 	char *ofm;
 	ftnint orl;
 	char *oblnk;
 } olist;
 /*close*/
 typedef struct
 {	flag cerr;
 	ftnint cunit;
 	char *csta;
 } cllist;
 /*rewind, backspace, endfile*/
 typedef struct
 {	flag aerr;
 	ftnint aunit;
 } alist;
 /* inquire */
 typedef struct
 {	flag inerr;
 	ftnint inunit;
 	char *infile;
 	ftnlen infilen;
 	ftnint	*inex;	/*parameters in standard's order*/
 	ftnint	*inopen;
 	ftnint	*innum;
 	ftnint	*innamed;
 	char	*inname;
 	ftnlen	innamlen;
 	char	*inacc;
 	ftnlen	inacclen;
 	char	*inseq;
 	ftnlen	inseqlen;
 	char 	*indir;
 	ftnlen	indirlen;
 	char	*infmt;
 	ftnlen	infmtlen;
 	char	*inform;
 	ftnint	informlen;
 	char	*inunf;
 	ftnlen	inunflen;
 	ftnint	*inrecl;
 	ftnint	*innrec;
 	char	*inblank;
 	ftnlen	inblanklen;
 } inlist;
 #define VOID void
 union Multitype {	/* for multiple entry points */
 	integer1 g;
 	shortint h;
 	integer i;
 	/* longint j; */
 	real r;
 	doublereal d;
 	complex c;
 	doublecomplex z;
 	};
 typedef union Multitype Multitype;
 struct Vardesc {	/* for Namelist */
 	char *name;
 	char *addr;
 	ftnlen *dims;
 	int  type;
 	};
 typedef struct Vardesc Vardesc;
 struct Namelist {
 	char *name;
 	Vardesc **vars;
 	int nvars;
 	};
 typedef struct Namelist Namelist;
 #define abs(x) ((x) >= 0 ? (x) : -(x))
 #define dabs(x) (fabs(x))
 #define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
 #define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
 #define dmin(a,b) (f2cmin(a,b))
 #define dmax(a,b) (f2cmax(a,b))
 #define bit_test(a,b)	((a) >> (b) & 1)
 #define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
 #define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
 #define abort_() { sig_die("Fortran abort routine called", 1); }
 #define c_abs(z) (cabsf(Cf(z)))
 #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
 #ifdef _MSC_VER
 #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
 #define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
 #else
 #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
 #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
 #endif
 #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
 #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
 #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
 //#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
 #define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
 #define d_abs(x) (fabs(*(x)))
 #define d_acos(x) (acos(*(x)))
 #define d_asin(x) (asin(*(x)))
 #define d_atan(x) (atan(*(x)))
 #define d_atn2(x, y) (atan2(*(x),*(y)))
 #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
 #define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
 #define d_cos(x) (cos(*(x)))
 #define d_cosh(x) (cosh(*(x)))
 #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
 #define d_exp(x) (exp(*(x)))
 #define d_imag(z) (cimag(Cd(z)))
 #define r_imag(z) (cimagf(Cf(z)))
 #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
 #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
 #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
 #define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
 #define d_log(x) (log(*(x)))
 #define d_mod(x, y) (fmod(*(x), *(y)))
 #define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
 #define d_nint(x) u_nint(*(x))
 #define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
 #define d_sign(a,b) u_sign(*(a),*(b))
 #define r_sign(a,b) u_sign(*(a),*(b))
 #define d_sin(x) (sin(*(x)))
 #define d_sinh(x) (sinh(*(x)))
 #define d_sqrt(x) (sqrt(*(x)))
 #define d_tan(x) (tan(*(x)))
 #define d_tanh(x) (tanh(*(x)))
 #define i_abs(x) abs(*(x))
 #define i_dnnt(x) ((integer)u_nint(*(x)))
 #define i_len(s, n) (n)
 #define i_nint(x) ((integer)u_nint(*(x)))
 #define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
 #define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
 #define pow_si(B,E) spow_ui(*(B),*(E))
 #define pow_ri(B,E) spow_ui(*(B),*(E))
 #define pow_di(B,E) dpow_ui(*(B),*(E))
 #define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
 #define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
 #define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
 #define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
 #define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
 static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
 #define myexit_() break;
 #define mycycle_() continue;
 #define myceiling_(w) {ceil(w)}
 #define myhuge_(w) {HUGE_VAL}
 //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
 #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
 /* procedure parameter types for -A and -C++ */
 #define F2C_proc_par_types 1
 #ifdef __cplusplus
 typedef logical (*L_fp)(...);
 #else
 typedef logical (*L_fp)();
 #endif
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x = 1/x;
 		for(u = n; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x = 1/x;
 		for(u = n; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 #ifdef _MSC_VER
 static _Fcomplex cpow_ui(complex x, integer n) {
 	complex pow={1.0,0.0}; unsigned long int u;
 		if(n != 0) {
 		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
 		for(u = n; ; ) {
 			if(u & 01) pow.r *= x.r, pow.i *= x.i;
 			if(u >>= 1) x.r *= x.r, x.i *= x.i;
 			else break;
 		}
 	}
 	_Fcomplex p={pow.r, pow.i};
 	return p;
 }
 #else
 static _Complex float cpow_ui(_Complex float x, integer n) {
 	_Complex float pow=1.0; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x = 1/x;
 		for(u = n; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 #endif
 #ifdef _MSC_VER
 static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
 	_Dcomplex pow={1.0,0.0}; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
 		for(u = n; ; ) {
 			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
 			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
 			else break;
 		}
 	}
 	_Dcomplex p = {pow._Val[0], pow._Val[1]};
 	return p;
 }
 #else
 static _Complex double zpow_ui(_Complex double x, integer n) {
 	_Complex double pow=1.0; unsigned long int u;
 	if(n != 0) {
 		if(n < 0) n = -n, x = 1/x;
 		for(u = n; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 #endif
 static integer pow_ii(integer x, integer n) {
 	integer pow; unsigned long int u;
 	if (n <= 0) {
 		if (n == 0 || x == 1) pow = 1;
 		else if (x != -1) pow = x == 0 ? 1/x : 0;
 		else n = -n;
 	}
 	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
 		u = n;
 		for(pow = 1; ; ) {
 			if(u & 01) pow *= x;
 			if(u >>= 1) x *= x;
 			else break;
 		}
 	}
 	return pow;
 }
 static integer dmaxloc_(double *w, integer s, integer e, integer *n)
 {
 	double m; integer i, mi;
 	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
 		if (w[i-1]>m) mi=i ,m=w[i-1];
 	return mi-s+1;
 }
 static integer smaxloc_(float *w, integer s, integer e, integer *n)
 {
 	float m; integer i, mi;
 	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
 		if (w[i-1]>m) mi=i ,m=w[i-1];
 	return mi-s+1;
 }
 static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
 	integer n = *n_, incx = *incx_, incy = *incy_, i;
 #ifdef _MSC_VER
 	_Fcomplex zdotc = {0.0, 0.0};
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
 			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
 			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
 		}
 	}
 	pCf(z) = zdotc;
 }
 #else
 	_Complex float zdotc = 0.0;
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
 		}
 	}
 	pCf(z) = zdotc;
 }
 #endif
 static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
 	integer n = *n_, incx = *incx_, incy = *incy_, i;
 #ifdef _MSC_VER
 	_Dcomplex zdotc = {0.0, 0.0};
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
 			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
 			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
 		}
 	}
 	pCd(z) = zdotc;
 }
 #else
 	_Complex double zdotc = 0.0;
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
 		}
 	}
 	pCd(z) = zdotc;
 }
 #endif	
 static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
 	integer n = *n_, incx = *incx_, incy = *incy_, i;
 #ifdef _MSC_VER
 	_Fcomplex zdotc = {0.0, 0.0};
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
 			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
 			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
 		}
 	}
 	pCf(z) = zdotc;
 }
 #else
 	_Complex float zdotc = 0.0;
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += Cf(&x[i]) * Cf(&y[i]);
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
 		}
 	}
 	pCf(z) = zdotc;
 }
 #endif
 static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
 	integer n = *n_, incx = *incx_, incy = *incy_, i;
 #ifdef _MSC_VER
 	_Dcomplex zdotc = {0.0, 0.0};
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
 			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
 			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
 		}
 	}
 	pCd(z) = zdotc;
 }
 #else
 	_Complex double zdotc = 0.0;
 	if (incx == 1 && incy == 1) {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += Cd(&x[i]) * Cd(&y[i]);
 		}
 	} else {
 		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
 			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
 		}
 	}
 	pCd(z) = zdotc;
 }
 #endif
 /*  -- translated by f2c (version 20000121).
   You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 /*  -- translated by f2c (version 20000121).
   You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 /* > \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar. */
 /*  =========== DOCUMENTATION =========== */
 /* Online html documentation available at */
 /*            http://www.netlib.org/lapack/explore-html/ */
 /* > \htmlonly */
 /* > Download ZDRSCL + dependencies */
 /* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zdrscl.
 f"> */
 /* > [TGZ]</a> */
 /* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zdrscl.
 f"> */
 /* > [ZIP]</a> */
 /* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zdrscl.
 f"> */
 /* > [TXT]</a> */
 /* > \endhtmlonly */
 /*  Definition: */
 /*  =========== */
 /*       SUBROUTINE ZRSCL( N, A, X, INCX ) */
 /*       INTEGER            INCX, N */
 /*       COMPLEX*16         A */
 /*       COMPLEX*16         X( * ) */
 /* > \par Purpose: */
 /*  ============= */
 /* > */
 /* > \verbatim */
 /* > */
 /* > ZRSCL multiplies an n-element complex vector x by the complex scalar */
 /* > 1/a.  This is done without overflow or underflow as long as */
 /* > the final result x/a does not overflow or underflow. */
 /* > \endverbatim */
 /*  Arguments: */
 /*  ========== */
 /* > \param[in] N */
 /* > \verbatim */
 /* >          N is INTEGER */
 /* >          The number of components of the vector x. */
 /* > \endverbatim */
 /* > */
 /* > \param[in] A */
 /* > \verbatim */
 /* >          A is COMPLEX*16 */
 /* >          The scalar a which is used to divide each component of x. */
 /* >          A must not be 0, or the subroutine will divide by zero. */
 /* > \endverbatim */
 /* > */
 /* > \param[in,out] X */
 /* > \verbatim */
 /* >          X is COMPLEX*16 array, dimension */
 /* >                         (1+(N-1)*abs(INCX)) */
 /* >          The n-element vector x. */
 /* > \endverbatim */
 /* > */
 /* > \param[in] INCX */
 /* > \verbatim */
 /* >          INCX is INTEGER */
 /* >          The increment between successive values of the vector SX. */
 /* >          > 0:  SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i),     1< i<= n */
 /* > \endverbatim */
 /*  Authors: */
 /*  ======== */
 /* > \author Univ. of Tennessee */
 /* > \author Univ. of California Berkeley */
 /* > \author Univ. of Colorado Denver */
 /* > \author NAG Ltd. */
 /* > \ingroup complex16OTHERauxiliary */
 /*  ===================================================================== */
 /* Subroutine */ int zrscl_(integer *n, doublecomplex *a, doublecomplex *x, 
 	integer *incx)
 {
    /* System generated locals */
    doublereal d__1, d__2;
    doublecomplex z__1;
    /* Local variables */
    doublereal absi, absr;
    extern /* Subroutine */ int zscal_(integer *, doublecomplex *, 
 	    doublecomplex *, integer *);
    doublereal ai, ar;
    extern doublereal dlamch_(char *);
    doublereal ui, ov, ur, safmin, safmax;
    extern /* Subroutine */ int zdscal_(integer *, doublereal *, 
 	    doublecomplex *, integer *), zdrscl_(integer *, doublereal *, 
 	    doublecomplex *, integer *);
 /*  -- LAPACK auxiliary routine -- */
 /*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
 /*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
 /* ===================================================================== */
 /*     Quick return if possible */
    /* Parameter adjustments */
    --x;
    /* Function Body */
    if (*n <= 0) {
 	return 0;
    }
 /*     Get machine parameters */
    safmin = dlamch_("S");
    safmax = 1. / safmin;
    ov = dlamch_("O");
 /*     Initialize constants related to A. */
    ar = a->r;
    ai = d_imag(a);
    absr = abs(ar);
    absi = abs(ai);
    if (ai == 0.) {
 /*        If alpha is real, then we can use csrscl */
 	zdrscl_(n, &ar, &x[1], incx);
    } else if (ar == 0.) {
 /*        If alpha has a zero real part, then we follow the same rules as if */
 /*        alpha were real. */
 	if (absi > safmax) {
 	    zdscal_(n, &safmin, &x[1], incx);
 	    d__1 = -safmax / ai;
 	    z__1.r = 0., z__1.i = d__1;
 	    zscal_(n, &z__1, &x[1], incx);
 	} else if (absi < safmin) {
 	    d__1 = -safmin / ai;
 	    z__1.r = 0., z__1.i = d__1;
 	    zscal_(n, &z__1, &x[1], incx);
 	    zdscal_(n, &safmax, &x[1], incx);
 	} else {
 	    d__1 = -1. / ai;
 	    z__1.r = 0., z__1.i = d__1;
 	    zscal_(n, &z__1, &x[1], incx);
 	}
    } else {
 /*        The following numbers can be computed. */
 /*        They are the inverse of the real and imaginary parts of 1/alpha. */
 /*        Note that a and b are always different from zero. */
 /*        NaNs are only possible if either: */
 /*        1. alphaR or alphaI is NaN. */
 /*        2. alphaR and alphaI are both infinite, in which case it makes sense */
 /*        to propagate a NaN. */
 	ur = ar + ai * (ai / ar);
 	ui = ai + ar * (ar / ai);
 	if (abs(ur) < safmin || abs(ui) < safmin) {
 /*           This means that both alphaR and alphaI are very small. */
 	    d__1 = safmin / ur;
 	    d__2 = -safmin / ui;
 	    z__1.r = d__1, z__1.i = d__2;
 	    zscal_(n, &z__1, &x[1], incx);
 	    zdscal_(n, &safmax, &x[1], incx);
 	} else if (abs(ur) > safmax || abs(ui) > safmax) {
 	    if (absr > ov || absi > ov) {
 /*              This means that a and b are both Inf. No need for scaling. */
 		d__1 = 1. / ur;
 		d__2 = -1. / ui;
 		z__1.r = d__1, z__1.i = d__2;
 		zscal_(n, &z__1, &x[1], incx);
 	    } else {
 		zdscal_(n, &safmin, &x[1], incx);
 		if (abs(ur) > ov || abs(ui) > ov) {
 /*                 Infs were generated. We do proper scaling to avoid them. */
 		    if (absr >= absi) {
 /*                    ABS( UR ) <= ABS( UI ) */
 			ur = safmin * ar + safmin * (ai * (ai / ar));
 			ui = safmin * ai + ar * (safmin * ar / ai);
 		    } else {
 /*                    ABS( UR ) > ABS( UI ) */
 			ur = safmin * ar + ai * (safmin * ai / ar);
 			ui = safmin * ai + safmin * (ar * (ar / ai));
 		    }
 		    d__1 = 1. / ur;
 		    d__2 = -1. / ui;
 		    z__1.r = d__1, z__1.i = d__2;
 		    zscal_(n, &z__1, &x[1], incx);
 		} else {
 		    d__1 = safmax / ur;
 		    d__2 = -safmax / ui;
 		    z__1.r = d__1, z__1.i = d__2;
 		    zscal_(n, &z__1, &x[1], incx);
 		}
 	    }
 	} else {
 	    d__1 = 1. / ur;
 	    d__2 = -1. / ui;
 	    z__1.r = d__1, z__1.i = d__2;
 	    zscal_(n, &z__1, &x[1], incx);
 	}
    }
    return 0;
 /*     End of ZRSCL */
 } /* zrscl_ */
--- a/lapack-netlib/SRC/zrscl.f
+++ b/lapack-netlib/SRC/zrscl.f
@ -0,0 +1,203 @@
 *> \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar.
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download ZDRSCL + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zdrscl.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zdrscl.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zdrscl.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE ZRSCL( N, A, X, INCX )
 *
 *       .. Scalar Arguments ..
 *       INTEGER            INCX, N
 *       COMPLEX*16         A
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16         X( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZRSCL multiplies an n-element complex vector x by the complex scalar
 *> 1/a.  This is done without overflow or underflow as long as
 *> the final result x/a does not overflow or underflow.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of components of the vector x.
 *> \endverbatim
 *>
 *> \param[in] A
 *> \verbatim
 *>          A is COMPLEX*16
 *>          The scalar a which is used to divide each component of x.
 *>          A must not be 0, or the subroutine will divide by zero.
 *> \endverbatim
 *>
 *> \param[in,out] X
 *> \verbatim
 *>          X is COMPLEX*16 array, dimension
 *>                         (1+(N-1)*abs(INCX))
 *>          The n-element vector x.
 *> \endverbatim
 *>
 *> \param[in] INCX
 *> \verbatim
 *>          INCX is INTEGER
 *>          The increment between successive values of the vector SX.
 *>          > 0:  SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i),     1< i<= n
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complex16OTHERauxiliary
 *
 *  =====================================================================
      SUBROUTINE ZRSCL( N, A, X, INCX )
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER            INCX, N
      COMPLEX*16         A
 *     ..
 *     .. Array Arguments ..
      COMPLEX*16         X( * )
 *     ..
 *
 * =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ZERO, ONE
      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
 *     ..
 *     .. Local Scalars ..
      DOUBLE PRECISION   SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR, UI
 *     ..
 *     .. External Functions ..
      DOUBLE PRECISION   DLAMCH
      COMPLEX*16         ZLADIV
      EXTERNAL           DLAMCH, ZLADIV
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DSCAL, ZDSCAL, ZDRSCL
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          ABS
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( N.LE.0 )
     $   RETURN
 *
 *     Get machine parameters
 *
      SAFMIN = DLAMCH( 'S' )
      SAFMAX = ONE / SAFMIN
      OV   = DLAMCH( 'O' )
 *
 *     Initialize constants related to A.
 *
      AR = DBLE( A )
      AI = DIMAG( A )
      ABSR = ABS( AR )
      ABSI = ABS( AI )
 *
      IF( AI.EQ.ZERO ) THEN
 *        If alpha is real, then we can use csrscl
         CALL ZDRSCL( N, AR, X, INCX )
 *
      ELSE IF( AR.EQ.ZERO ) THEN
 *        If alpha has a zero real part, then we follow the same rules as if
 *        alpha were real.
         IF( ABSI.GT.SAFMAX ) THEN
            CALL ZDSCAL( N, SAFMIN, X, INCX )
            CALL ZSCAL( N, DCMPLX( ZERO, -SAFMAX / AI ), X, INCX )
         ELSE IF( ABSI.LT.SAFMIN ) THEN
            CALL ZSCAL( N, DCMPLX( ZERO, -SAFMIN / AI ), X, INCX )
            CALL ZDSCAL( N, SAFMAX, X, INCX )
         ELSE
            CALL ZSCAL( N, DCMPLX( ZERO, -ONE / AI ), X, INCX )
         END IF
 *
      ELSE
 *        The following numbers can be computed.
 *        They are the inverse of the real and imaginary parts of 1/alpha.
 *        Note that a and b are always different from zero.
 *        NaNs are only possible if either:
 *        1. alphaR or alphaI is NaN.
 *        2. alphaR and alphaI are both infinite, in which case it makes sense
 *        to propagate a NaN.
         UR = AR + AI * ( AI / AR )
         UI = AI + AR * ( AR / AI )
 *
         IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN
 *           This means that both alphaR and alphaI are very small.
            CALL ZSCAL( N, DCMPLX( SAFMIN / UR, -SAFMIN / UI ), X,
     $                  INCX )
            CALL ZDSCAL( N, SAFMAX, X, INCX )
         ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN
            IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN
 *              This means that a and b are both Inf. No need for scaling.
               CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX )
            ELSE
               CALL ZDSCAL( N, SAFMIN, X, INCX )
               IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN
 *                 Infs were generated. We do proper scaling to avoid them.
                  IF( ABSR.GE.ABSI ) THEN
 *                    ABS( UR ) <= ABS( UI )
                     UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR ))
                     UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI )
                  ELSE
 *                    ABS( UR ) > ABS( UI )
                     UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR )
                     UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI ))
                  END IF
                  CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X,
     $                        INCX )
               ELSE
                  CALL ZSCAL( N, DCMPLX( SAFMAX / UR, -SAFMAX / UI ),
     $                        X, INCX )
               END IF
            END IF
         ELSE
            CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX )
         END IF
      END IF
 *
      RETURN
 *
 *     End of ZRSCL
 *
      END
--- a/lapack-netlib/SRC/zstemr.f
+++ b/lapack-netlib/SRC/zstemr.f
@ -320,7 +320,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@ -330,6 +330,7 @@
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
 *> Christof Voemel, University of California, Berkeley, USA \n
 *> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
      SUBROUTINE ZSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@ -361,7 +362,8 @@
     $                     MINRGP = 1.0D-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
     $                   LAESWAP
      INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
     $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
     $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@ -397,6 +399,7 @@
 *
      LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
      ZQUERY = ( NZC.EQ.-1 )
      LAESWAP = .FALSE.
 *     DSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N.
@ -519,6 +522,15 @@
         ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
            CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
         END IF
 *        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
 *        the following code requires R1 >= R2. Hence, we correct
 *        the order of R1, R2, CS, SN if R1 < R2 before further processing.
         IF( R1.LT.R2 ) THEN
            E(2) = R1
            R1 = R2
            R2 = E(2)
            LAESWAP = .TRUE.
         ENDIF
         IF( ALLEIG.OR.
     $      (VALEIG.AND.(R2.GT.WL).AND.
     $                  (R2.LE.WU)).OR.
@ -526,8 +538,13 @@
            M = M+1
            W( M ) = R2
            IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
+               IF( LAESWAP ) THEN
-               Z( 2, M ) = CS
+                  Z( 1, M ) = CS
                  Z( 2, M ) = SN
               ELSE
                  Z( 1, M ) = -SN
                  Z( 2, M ) = CS
               ENDIF
 *              Note: At most one of SN and CS can be zero.
               IF (SN.NE.ZERO) THEN
                  IF (CS.NE.ZERO) THEN
@ -550,8 +567,13 @@
            M = M+1
            W( M ) = R1
            IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
+               IF( LAESWAP ) THEN
-               Z( 2, M ) = SN
+                  Z( 1, M ) = -SN
                  Z( 2, M ) = CS
               ELSE
                  Z( 1, M ) = CS
                  Z( 2, M ) = SN
               ENDIF
 *              Note: At most one of SN and CS can be zero.
               IF (SN.NE.ZERO) THEN
                  IF (CS.NE.ZERO) THEN