Merge branch 'xianyi:develop' into issue4128

Martin Kroeker 2023-07-14 13:33:52 +02:00 committed by GitHub
commit a6e41900fd
41 changed files with 2143 additions and 141 deletions

View File

@ -69,7 +69,7 @@ endif
# in GCC>=9
ifeq ($(CORE), NEOVERSEN1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
@ -92,9 +92,14 @@ endif
# in GCC>=10.4
ifeq ($(CORE), NEOVERSEV1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.4-a+sve
ifeq (1, $(ISCLANG))
CCOMMON_OPT += -mtune=cortex-x1
else
CCOMMON_OPT += -mtune=neoverse-v1
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
endif
@ -122,8 +127,8 @@ endif
# in GCC>=10.4
ifeq ($(CORE), NEOVERSEN2)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
ifneq ($(OSNAME), Darwin)
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
else
@ -155,7 +160,7 @@ endif
# Use a53 tunings because a55 is only available in GCC>=8.1
ifeq ($(CORE), CORTEXA55)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ8), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
@ -196,8 +201,13 @@ endif
endif
ifeq ($(CORE), THUNDERX3T110)
ifeq ($(GCCVERSIONGTEQ10), 1)
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.3-a
ifeq (0, $(ISCLANG))
CCOMMON_OPT += -mtune=thunderx3t110
else
CCOMMON_OPT += -mtune=thunderx2t99
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
endif
@ -225,9 +235,12 @@ endif
endif
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
ifeq ($(CORE), EMAG8180)
CCOMMON_OPT += -march=armv8-a -mtune=emag
CCOMMON_OPT += -march=armv8-a
ifeq ($(ISCLANG), 0)
CCOMMON_OPT += -mtune=emag
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=emag
endif

View File

@ -9,9 +9,12 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n
Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
<!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
OSUOSL POWERCI [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
## Introduction
OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.

View File

@ -271,6 +271,19 @@ jobs:
- script: |
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
- job: OSX_xbuild_DYNAMIC_ARM64
pool:
vmImage: 'macOS-11'
variables:
CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
steps:
- script: |
ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
/Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
/Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
- job: ALPINE_MUSL
pool:
vmImage: 'ubuntu-latest'

View File

@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2)
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

View File

@ -187,7 +187,7 @@ set(CLASRC
cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f
crot.f cspcon.f csprfs.f cspsv.f
crot.f crscl.f cspcon.f csprfs.f cspsv.f
cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
cstegr.f cstein.f csteqr.f csycon.f
csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f
@ -381,7 +381,7 @@ set(ZLASRC
zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
zrot.f zspcon.f zsprfs.f zspsv.f
zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
zstegr.f zstein.f zsteqr.f zsycon.f
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f
@ -686,7 +686,7 @@ set(CLASRC
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
crot.c cspcon.c csprfs.c cspsv.c
crot.c crscl.c cspcon.c csprfs.c cspsv.c
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
cstegr.c cstein.c csteqr.c csycon.c
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
@ -878,7 +878,7 @@ set(ZLASRC
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
zrot.c zspcon.c zsprfs.c zspsv.c
zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
zstegr.c zstein.c zsteqr.c zsycon.c
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c

View File

@ -280,7 +280,29 @@ if (DEFINED TARGET)
if (${TARGET} STREQUAL POWER8)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
endif()
if (${TARGET} STREQUAL NEOVERSEV1)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
else ()
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
endif()
endif()
if (${TARGET} STREQUAL NEOVERSEN2)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
else ()
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
endif()
endif()
if (${TARGET} STREQUAL ARMV8SVE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
endif()
endif()
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()

View File

@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
#ifndef NO_AFFINITY
static inline int WhereAmI(void){
int ret = 0, counter = 0;
__asm__ volatile (
"rdtimel.w %[counter], %[id]"
: [id]"=r"(ret), [counter]"=r"(counter)
:
: "memory"
);
return ret;
}
#endif
#ifdef DOUBLE
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
#else

View File

@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( order == BlasColMajor)
{
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
if ( trans == BlasTrans && *ldb < *cols ) info = 9;
if ( trans == BlasNoTrans && *ldb < *rows ) info = 8;
if ( trans == BlasTrans && *ldb < *cols ) info = 8;
}
if ( order == BlasRowMajor)
{
if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
if ( trans == BlasTrans && *ldb < *rows ) info = 9;
if ( trans == BlasNoTrans && *ldb < *cols ) info = 8;
if ( trans == BlasTrans && *ldb < *rows ) info = 8;
}
if ( order == BlasColMajor && *lda < *rows ) info = 7;

View File

@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
BLASLONG sve_width = SVE_WIDTH;
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
svbool_t pg_a = SVE_WHILELT(i, n);
svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
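The casts above (and in the SVE packing kernels that follow) pin the generic svwhilelt / SV_WHILE intrinsics to their unsigned overloads instead of leaving the choice to implicit conversion from BLASLONG, which clang is stricter about than gcc. A minimal sketch of the idiom, assuming an SVE-capable compiler and ACLE's arm_sve.h; the helper name is illustrative, not part of the kernels:

```c
#include <arm_sve.h>
#include <stdint.h>

/* Predicate covering elements i, i+1, ..., n-1 of a double-precision loop.
 * Casting both operands to uint64_t selects the unsigned svwhilelt_b64
 * overload explicitly, matching the casts added in the kernels above. */
static inline svbool_t tail_predicate(long i, long n)
{
    return svwhilelt_b64((uint64_t)i, (uint64_t)n);
}
```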

View File

@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
BLASLONG remaining_n = n - single_vectors_n;
if (remaining_n) {
a_offset_inner = a_offset;
svbool_t pg = SV_WHILE(0L, remaining_n);
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
uint64_t active = remaining_n;
uint64_t i_cnt = m >> 2;
while (i_cnt--) {

View File

@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG remaining_n = n - single_vectors_n;
if (remaining_n) {
a_offset_inner = a_offset;
svbool_t pg = SV_WHILE(0L, remaining_n);
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
uint64_t active = remaining_n;
uint64_t i_cnt = m >> 2;
while (i_cnt--) {

View File

@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE
int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
int32_t N = n;
int32_t js = 0;
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N);
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do {
@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, N);
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
BLASLONG i, ii, j, jj;
FLOAT data01, data02;
FLOAT data01=0.0, data02=0.0;
FLOAT *a1;
lda *= 2;

View File

@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
FLOAT data05, data06, data07, data08;
FLOAT *a1, *a2;
data01=data02=data07=data08=0.0;
lda *= 2;
jj = offset;

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
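The c/d/s/z LAPACKE_?larfb fixes in this and the following files all make the same change: the triangle of V that must be preserved depends on DIRECT and STOREV, not on SIDE. A hedged sketch of the rule the new expression encodes, following the ?LARFB storage convention; the helper below is illustrative, not part of LAPACKE:

```c
/* Which triangle of the k-by-k head of V is referenced:
 *   DIRECT='F', STOREV='C' -> unit lower
 *   DIRECT='B', STOREV='C' -> unit upper
 *   DIRECT='F', STOREV='R' -> unit upper
 *   DIRECT='B', STOREV='R' -> unit lower */
static char larfb_uplo(int forward /* DIRECT == 'F' */, int col /* STOREV == 'C' */)
{
    return ((forward && col) || (!forward && !col)) ? 'l' : 'u';
}
```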

View File

@ -60,7 +60,7 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans,
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
ldc_t = MAX(1,m);
ldt_t = MAX(1,k);

View File

@ -57,7 +57,7 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );

View File

@ -59,7 +59,7 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans,
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
ldc_t = MAX(1,m);
ldt_t = MAX(1,k);

View File

@ -57,7 +57,7 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
LAPACKE_xerbla( "LAPACKE_slarfb", -8 );

View File

@ -59,7 +59,7 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans,
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
ldc_t = MAX(1,m);
ldt_t = MAX(1,k);

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );

View File

@ -60,7 +60,7 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans,
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
ldc_t = MAX(1,m);
ldt_t = MAX(1,k);

View File

@ -280,7 +280,7 @@ CLASRC_O = \
cposv.o cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \
cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \
cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \
crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \
crot.o crscl.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \
cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \
cstegr.o cstein.o csteqr.o \
csycon.o csymv.o \
@ -488,7 +488,7 @@ ZLASRC_O = \
zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \
zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \
zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \
zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \
zrot.o zrscl.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \
zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \
zstegr.o zstein.o zsteqr.o \
zsycon.o zsymv.o \

View File

@ -170,7 +170,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complexGEsolve
*> \ingroup gelss
*
* =====================================================================
SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@ -214,8 +214,7 @@
* .. External Subroutines ..
EXTERNAL CBDSQR, CCOPY, CGEBRD, CGELQF, CGEMM, CGEMV,
$ CGEQRF, CLACPY, CLASCL, CLASET, CSRSCL, CUNGBR,
$ CUNMBR, CUNMLQ, CUNMQR, SLABAD, SLASCL, SLASET,
$ XERBLA
$ CUNMBR, CUNMLQ, CUNMQR, SLASCL, SLASET, XERBLA
* ..
* .. External Functions ..
INTEGER ILAENV
@ -388,7 +387,6 @@
SFMIN = SLAMCH( 'S' )
SMLNUM = SFMIN / EPS
BIGNUM = ONE / SMLNUM
CALL SLABAD( SMLNUM, BIGNUM )
*
* Scale A if max element outside range [SMLNUM,BIGNUM]
*
@ -540,7 +538,7 @@
$ LDB, CZERO, WORK, N )
CALL CLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
20 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL CGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
CALL CCOPY( N, WORK, 1, B, 1 )
END IF
@ -645,7 +643,7 @@
CALL CLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
$ LDB )
40 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL CGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ),
$ 1, CZERO, WORK( IWORK ), 1 )
CALL CCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@ -737,7 +735,7 @@
$ LDB, CZERO, WORK, N )
CALL CLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
60 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL CGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
CALL CCOPY( N, WORK, 1, B, 1 )
END IF

View File

@ -101,7 +101,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complexGEcomputational
*> \ingroup getf2
*
* =====================================================================
SUBROUTINE CGETF2( M, N, A, LDA, IPIV, INFO )
@ -126,16 +126,14 @@
$ ZERO = ( 0.0E+0, 0.0E+0 ) )
* ..
* .. Local Scalars ..
REAL SFMIN
INTEGER I, J, JP
INTEGER J, JP
* ..
* .. External Functions ..
REAL SLAMCH
INTEGER ICAMAX
EXTERNAL SLAMCH, ICAMAX
EXTERNAL ICAMAX
* ..
* .. External Subroutines ..
EXTERNAL CGERU, CSCAL, CSWAP, XERBLA
EXTERNAL CGERU, CRSCL, CSWAP, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@ -161,10 +159,6 @@
*
IF( M.EQ.0 .OR. N.EQ.0 )
$ RETURN
*
* Compute machine safe minimum
*
SFMIN = SLAMCH('S')
*
DO 10 J = 1, MIN( M, N )
*
@ -181,15 +175,8 @@
*
* Compute elements J+1:M of J-th column.
*
IF( J.LT.M ) THEN
IF( ABS(A( J, J )) .GE. SFMIN ) THEN
CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 )
ELSE
DO 20 I = 1, M-J
A( J+I, J ) = A( J+I, J ) / A( J, J )
20 CONTINUE
END IF
END IF
IF( J.LT.M )
$ CALL CRSCL( M-J, A( J, J ), A( J+1, J ), 1 )
*
ELSE IF( INFO.EQ.0 ) THEN
*
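The cgetf2 hunk above (and the zgetf2 hunk further down) replaces the inline SFMIN test and per-element division with one call to the new CRSCL routine. For context, a compact C sketch of the stepwise safe scaling used by the real-valued ?RSCL routines, to which CRSCL delegates when the pivot is real; the function name and the use of DBL_MIN and fabs in place of DLAMCH('S') are illustrative assumptions:

```c
#include <float.h>
#include <math.h>

/* Scale x by 1/a without forming 1/a when |a| is tiny or huge:
 * repeatedly multiply by SAFMIN or SAFMAX until cnum/cden is
 * representable, mirroring the loop in the reference DRSCL. */
static void rscl_sketch(int n, double a, double *x, int incx)
{
    double safmin = DBL_MIN;          /* stand-in for DLAMCH('S') */
    double safmax = 1.0 / safmin;
    double cden = a, cnum = 1.0, mul;
    int done;
    do {
        double cden1 = cden * safmin;
        double cnum1 = cnum / safmax;
        if (fabs(cden1) > fabs(cnum) && cnum != 0.0) {
            mul = safmin; cden = cden1; done = 0;   /* shrink the denominator safely */
        } else if (fabs(cnum1) > fabs(cden)) {
            mul = safmax; cnum = cnum1; done = 0;   /* grow the numerator safely */
        } else {
            mul = cnum / cden;         done = 1;    /* final exact factor */
        }
        for (int i = 0; i < n; i++)
            x[i * incx] *= mul;
    } while (!done);
}
```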

lapack-netlib/SRC/crscl.c (new file, 735 lines)
View File

@ -0,0 +1,735 @@
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <complex.h>
#ifdef complex
#undef complex
#endif
#ifdef I
#undef I
#endif
#if defined(_WIN64)
typedef long long BLASLONG;
typedef unsigned long long BLASULONG;
#else
typedef long BLASLONG;
typedef unsigned long BLASULONG;
#endif
#ifdef LAPACK_ILP64
typedef BLASLONG blasint;
#if defined(_WIN64)
#define blasabs(x) llabs(x)
#else
#define blasabs(x) labs(x)
#endif
#else
typedef int blasint;
#define blasabs(x) abs(x)
#endif
typedef blasint integer;
typedef unsigned int uinteger;
typedef char *address;
typedef short int shortint;
typedef float real;
typedef double doublereal;
typedef struct { real r, i; } complex;
typedef struct { doublereal r, i; } doublecomplex;
#ifdef _MSC_VER
static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
#else
static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
#endif
#define pCf(z) (*_pCf(z))
#define pCd(z) (*_pCd(z))
typedef int logical;
typedef short int shortlogical;
typedef char logical1;
typedef char integer1;
#define TRUE_ (1)
#define FALSE_ (0)
/* Extern is for use with -E */
#ifndef Extern
#define Extern extern
#endif
/* I/O stuff */
typedef int flag;
typedef int ftnlen;
typedef int ftnint;
/*external read, write*/
typedef struct
{ flag cierr;
ftnint ciunit;
flag ciend;
char *cifmt;
ftnint cirec;
} cilist;
/*internal read, write*/
typedef struct
{ flag icierr;
char *iciunit;
flag iciend;
char *icifmt;
ftnint icirlen;
ftnint icirnum;
} icilist;
/*open*/
typedef struct
{ flag oerr;
ftnint ounit;
char *ofnm;
ftnlen ofnmlen;
char *osta;
char *oacc;
char *ofm;
ftnint orl;
char *oblnk;
} olist;
/*close*/
typedef struct
{ flag cerr;
ftnint cunit;
char *csta;
} cllist;
/*rewind, backspace, endfile*/
typedef struct
{ flag aerr;
ftnint aunit;
} alist;
/* inquire */
typedef struct
{ flag inerr;
ftnint inunit;
char *infile;
ftnlen infilen;
ftnint *inex; /*parameters in standard's order*/
ftnint *inopen;
ftnint *innum;
ftnint *innamed;
char *inname;
ftnlen innamlen;
char *inacc;
ftnlen inacclen;
char *inseq;
ftnlen inseqlen;
char *indir;
ftnlen indirlen;
char *infmt;
ftnlen infmtlen;
char *inform;
ftnint informlen;
char *inunf;
ftnlen inunflen;
ftnint *inrecl;
ftnint *innrec;
char *inblank;
ftnlen inblanklen;
} inlist;
#define VOID void
union Multitype { /* for multiple entry points */
integer1 g;
shortint h;
integer i;
/* longint j; */
real r;
doublereal d;
complex c;
doublecomplex z;
};
typedef union Multitype Multitype;
struct Vardesc { /* for Namelist */
char *name;
char *addr;
ftnlen *dims;
int type;
};
typedef struct Vardesc Vardesc;
struct Namelist {
char *name;
Vardesc **vars;
int nvars;
};
typedef struct Namelist Namelist;
#define abs(x) ((x) >= 0 ? (x) : -(x))
#define dabs(x) (fabs(x))
#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
#define dmin(a,b) (f2cmin(a,b))
#define dmax(a,b) (f2cmax(a,b))
#define bit_test(a,b) ((a) >> (b) & 1)
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
#define abort_() { sig_die("Fortran abort routine called", 1); }
#define c_abs(z) (cabsf(Cf(z)))
#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
#ifdef _MSC_VER
#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
#else
#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
#endif
#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
#define d_abs(x) (fabs(*(x)))
#define d_acos(x) (acos(*(x)))
#define d_asin(x) (asin(*(x)))
#define d_atan(x) (atan(*(x)))
#define d_atn2(x, y) (atan2(*(x),*(y)))
#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
#define d_cos(x) (cos(*(x)))
#define d_cosh(x) (cosh(*(x)))
#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
#define d_exp(x) (exp(*(x)))
#define d_imag(z) (cimag(Cd(z)))
#define r_imag(z) (cimagf(Cf(z)))
#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
#define d_log(x) (log(*(x)))
#define d_mod(x, y) (fmod(*(x), *(y)))
#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
#define d_nint(x) u_nint(*(x))
#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
#define d_sign(a,b) u_sign(*(a),*(b))
#define r_sign(a,b) u_sign(*(a),*(b))
#define d_sin(x) (sin(*(x)))
#define d_sinh(x) (sinh(*(x)))
#define d_sqrt(x) (sqrt(*(x)))
#define d_tan(x) (tan(*(x)))
#define d_tanh(x) (tanh(*(x)))
#define i_abs(x) abs(*(x))
#define i_dnnt(x) ((integer)u_nint(*(x)))
#define i_len(s, n) (n)
#define i_nint(x) ((integer)u_nint(*(x)))
#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
#define pow_si(B,E) spow_ui(*(B),*(E))
#define pow_ri(B,E) spow_ui(*(B),*(E))
#define pow_di(B,E) dpow_ui(*(B),*(E))
#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; }
#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
#define sig_die(s, kill) { exit(1); }
#define s_stop(s, n) {exit(0);}
static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
#define z_abs(z) (cabs(Cd(z)))
#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
#define myexit_() break;
#define mycycle_() continue;
#define myceiling_(w) {ceil(w)}
#define myhuge_(w) {HUGE_VAL}
//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
/* procedure parameter types for -A and -C++ */
#define F2C_proc_par_types 1
#ifdef __cplusplus
typedef logical (*L_fp)(...);
#else
typedef logical (*L_fp)();
#endif
static float spow_ui(float x, integer n) {
float pow=1.0; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x = 1/x;
for(u = n; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
static double dpow_ui(double x, integer n) {
double pow=1.0; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x = 1/x;
for(u = n; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
#ifdef _MSC_VER
static _Fcomplex cpow_ui(complex x, integer n) {
complex pow={1.0,0.0}; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
for(u = n; ; ) {
if(u & 01) pow.r *= x.r, pow.i *= x.i;
if(u >>= 1) x.r *= x.r, x.i *= x.i;
else break;
}
}
_Fcomplex p={pow.r, pow.i};
return p;
}
#else
static _Complex float cpow_ui(_Complex float x, integer n) {
_Complex float pow=1.0; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x = 1/x;
for(u = n; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
#endif
#ifdef _MSC_VER
static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
_Dcomplex pow={1.0,0.0}; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
for(u = n; ; ) {
if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
else break;
}
}
_Dcomplex p = {pow._Val[0], pow._Val[1]};
return p;
}
#else
static _Complex double zpow_ui(_Complex double x, integer n) {
_Complex double pow=1.0; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x = 1/x;
for(u = n; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
#endif
static integer pow_ii(integer x, integer n) {
integer pow; unsigned long int u;
if (n <= 0) {
if (n == 0 || x == 1) pow = 1;
else if (x != -1) pow = x == 0 ? 1/x : 0;
else n = -n;
}
if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
u = n;
for(pow = 1; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
static integer dmaxloc_(double *w, integer s, integer e, integer *n)
{
double m; integer i, mi;
for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
if (w[i-1]>m) mi=i ,m=w[i-1];
return mi-s+1;
}
static integer smaxloc_(float *w, integer s, integer e, integer *n)
{
float m; integer i, mi;
for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
if (w[i-1]>m) mi=i ,m=w[i-1];
return mi-s+1;
}
static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
integer n = *n_, incx = *incx_, incy = *incy_, i;
#ifdef _MSC_VER
_Fcomplex zdotc = {0.0, 0.0};
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
}
}
pCf(z) = zdotc;
}
#else
_Complex float zdotc = 0.0;
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
}
}
pCf(z) = zdotc;
}
#endif
static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
integer n = *n_, incx = *incx_, incy = *incy_, i;
#ifdef _MSC_VER
_Dcomplex zdotc = {0.0, 0.0};
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
}
}
pCd(z) = zdotc;
}
#else
_Complex double zdotc = 0.0;
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
}
}
pCd(z) = zdotc;
}
#endif
static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
integer n = *n_, incx = *incx_, incy = *incy_, i;
#ifdef _MSC_VER
_Fcomplex zdotc = {0.0, 0.0};
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
}
}
pCf(z) = zdotc;
}
#else
_Complex float zdotc = 0.0;
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += Cf(&x[i]) * Cf(&y[i]);
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
}
}
pCf(z) = zdotc;
}
#endif
static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
integer n = *n_, incx = *incx_, incy = *incy_, i;
#ifdef _MSC_VER
_Dcomplex zdotc = {0.0, 0.0};
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
}
}
pCd(z) = zdotc;
}
#else
_Complex double zdotc = 0.0;
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += Cd(&x[i]) * Cd(&y[i]);
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
}
}
pCd(z) = zdotc;
}
#endif
/* -- translated by f2c (version 20000121).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
/* -- translated by f2c (version 20000121).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
/* > \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. */
/* =========== DOCUMENTATION =========== */
/* Online html documentation available at */
/* http://www.netlib.org/lapack/explore-html/ */
/* > \htmlonly */
/* > Download CRSCL + dependencies */
/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crscl.f
"> */
/* > [TGZ]</a> */
/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crscl.f
"> */
/* > [ZIP]</a> */
/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crscl.f
"> */
/* > [TXT]</a> */
/* > \endhtmlonly */
/* Definition: */
/* =========== */
/* SUBROUTINE CRSCL( N, A, X, INCX ) */
/* INTEGER INCX, N */
/* COMPLEX A */
/* COMPLEX X( * ) */
/* > \par Purpose: */
/* ============= */
/* > */
/* > \verbatim */
/* > */
/* > CRSCL multiplies an n-element complex vector x by the complex scalar */
/* > 1/a. This is done without overflow or underflow as long as */
/* > the final result x/a does not overflow or underflow. */
/* > \endverbatim */
/* Arguments: */
/* ========== */
/* > \param[in] N */
/* > \verbatim */
/* > N is INTEGER */
/* > The number of components of the vector x. */
/* > \endverbatim */
/* > */
/* > \param[in] A */
/* > \verbatim */
/* > A is COMPLEX */
/* > The scalar a which is used to divide each component of x. */
/* > A must not be 0, or the subroutine will divide by zero. */
/* > \endverbatim */
/* > */
/* > \param[in,out] X */
/* > \verbatim */
/* > X is COMPLEX array, dimension */
/* > (1+(N-1)*abs(INCX)) */
/* > The n-element vector x. */
/* > \endverbatim */
/* > */
/* > \param[in] INCX */
/* > \verbatim */
/* > INCX is INTEGER */
/* > The increment between successive values of the vector X. */
/* > > 0: X(1) = X(1) and X(1+(i-1)*INCX) = x(i), 1< i<= n */
/* > \endverbatim */
/* Authors: */
/* ======== */
/* > \author Univ. of Tennessee */
/* > \author Univ. of California Berkeley */
/* > \author Univ. of Colorado Denver */
/* > \author NAG Ltd. */
/* > \ingroup complexOTHERauxiliary */
/* ===================================================================== */
/* Subroutine */ int crscl_(integer *n, complex *a, complex *x, integer *incx)
{
/* System generated locals */
real r__1, r__2;
complex q__1;
/* Local variables */
real absi, absr;
extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
integer *);
real ai, ar, ui, ov, ur;
extern real slamch_(char *);
extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer
*);
real safmin, safmax;
extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer
*);
/* -- LAPACK auxiliary routine -- */
/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */
/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
/* ===================================================================== */
/* Quick return if possible */
/* Parameter adjustments */
--x;
/* Function Body */
if (*n <= 0) {
return 0;
}
/* Get machine parameters */
safmin = slamch_("S");
safmax = 1.f / safmin;
ov = slamch_("O");
/* Initialize constants related to A. */
ar = a->r;
ai = r_imag(a);
absr = abs(ar);
absi = abs(ai);
if (ai == 0.f) {
/* If alpha is real, then we can use csrscl */
csrscl_(n, &ar, &x[1], incx);
} else if (ar == 0.f) {
/* If alpha has a zero real part, then we follow the same rules as if */
/* alpha were real. */
if (absi > safmax) {
csscal_(n, &safmin, &x[1], incx);
r__1 = -safmax / ai;
q__1.r = 0.f, q__1.i = r__1;
cscal_(n, &q__1, &x[1], incx);
} else if (absi < safmin) {
r__1 = -safmin / ai;
q__1.r = 0.f, q__1.i = r__1;
cscal_(n, &q__1, &x[1], incx);
csscal_(n, &safmax, &x[1], incx);
} else {
r__1 = -1.f / ai;
q__1.r = 0.f, q__1.i = r__1;
cscal_(n, &q__1, &x[1], incx);
}
} else {
/* The following numbers can be computed. */
/* They are the inverse of the real and imaginary parts of 1/alpha. */
/* Note that a and b are always different from zero. */
/* NaNs are only possible if either: */
/* 1. alphaR or alphaI is NaN. */
/* 2. alphaR and alphaI are both infinite, in which case it makes sense */
/* to propagate a NaN. */
ur = ar + ai * (ai / ar);
ui = ai + ar * (ar / ai);
if (abs(ur) < safmin || abs(ui) < safmin) {
/* This means that both alphaR and alphaI are very small. */
r__1 = safmin / ur;
r__2 = -safmin / ui;
q__1.r = r__1, q__1.i = r__2;
cscal_(n, &q__1, &x[1], incx);
csscal_(n, &safmax, &x[1], incx);
} else if (abs(ur) > safmax || abs(ui) > safmax) {
if (absr > ov || absi > ov) {
/* This means that a and b are both Inf. No need for scaling. */
r__1 = 1.f / ur;
r__2 = -1.f / ui;
q__1.r = r__1, q__1.i = r__2;
cscal_(n, &q__1, &x[1], incx);
} else {
csscal_(n, &safmin, &x[1], incx);
if (abs(ur) > ov || abs(ui) > ov) {
/* Infs were generated. We do proper scaling to avoid them. */
if (absr >= absi) {
/* ABS( UR ) <= ABS( UI ) */
ur = safmin * ar + safmin * (ai * (ai / ar));
ui = safmin * ai + ar * (safmin * ar / ai);
} else {
/* ABS( UR ) > ABS( UI ) */
ur = safmin * ar + ai * (safmin * ai / ar);
ui = safmin * ai + safmin * (ar * (ar / ai));
}
r__1 = 1.f / ur;
r__2 = -1.f / ui;
q__1.r = r__1, q__1.i = r__2;
cscal_(n, &q__1, &x[1], incx);
} else {
r__1 = safmax / ur;
r__2 = -safmax / ui;
q__1.r = r__1, q__1.i = r__2;
cscal_(n, &q__1, &x[1], incx);
}
}
} else {
r__1 = 1.f / ur;
r__2 = -1.f / ui;
q__1.r = r__1, q__1.i = r__2;
cscal_(n, &q__1, &x[1], incx);
}
}
return 0;
/* End of CRSCL */
} /* crscl_ */
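A short usage illustration for the crscl_ entry point defined above. The types follow the f2c typedefs at the top of this file (non-ILP64 shown); the program is an example only and must be linked against this object plus a provider of slamch_, cscal_, csscal_ and csrscl_ such as OpenBLAS:

```c
#include <stdio.h>

typedef int integer;                    /* matches the f2c typedef above (non-ILP64) */
typedef struct { float r, i; } complex; /* matches the f2c complex */

extern int crscl_(integer *n, complex *a, complex *x, integer *incx);

int main(void)
{
    complex a = {3.0f, 4.0f};                       /* divide by 3 + 4i */
    complex x[3] = {{1.0f, 0.0f}, {0.0f, 1.0f}, {5.0f, -5.0f}};
    integer n = 3, incx = 1;
    crscl_(&n, &a, x, &incx);                       /* x := x / (3 + 4i), overflow-safe */
    for (int i = 0; i < 3; i++)
        printf("(% .6f, % .6f)\n", x[i].r, x[i].i);
    return 0;
}
```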

lapack-netlib/SRC/crscl.f (new file, 202 lines)
View File

@ -0,0 +1,202 @@
*> \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar.
*
* =========== DOCUMENTATION ===========
*
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
*> \htmlonly
*> Download CRSCL + dependencies
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crscl.f">
*> [TGZ]</a>
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crscl.f">
*> [ZIP]</a>
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crscl.f">
*> [TXT]</a>
*> \endhtmlonly
*
* Definition:
* ===========
*
* SUBROUTINE CRSCL( N, A, X, INCX )
*
* .. Scalar Arguments ..
* INTEGER INCX, N
* COMPLEX A
* ..
* .. Array Arguments ..
* COMPLEX X( * )
* ..
*
*
*> \par Purpose:
* =============
*>
*> \verbatim
*>
*> CRSCL multiplies an n-element complex vector x by the complex scalar
*> 1/a. This is done without overflow or underflow as long as
*> the final result x/a does not overflow or underflow.
*> \endverbatim
*
* Arguments:
* ==========
*
*> \param[in] N
*> \verbatim
*> N is INTEGER
*> The number of components of the vector x.
*> \endverbatim
*>
*> \param[in] A
*> \verbatim
*> A is COMPLEX
*> The scalar a which is used to divide each component of x.
*> A must not be 0, or the subroutine will divide by zero.
*> \endverbatim
*>
*> \param[in,out] X
*> \verbatim
*> X is COMPLEX array, dimension
*> (1+(N-1)*abs(INCX))
*> The n-element vector x.
*> \endverbatim
*>
*> \param[in] INCX
*> \verbatim
*> INCX is INTEGER
*> The increment between successive values of the vector X.
*> > 0: X(1) = X(1) and X(1+(i-1)*INCX) = x(i), 1< i<= n
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complexOTHERauxiliary
*
* =====================================================================
SUBROUTINE CRSCL( N, A, X, INCX )
*
* -- LAPACK auxiliary routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
*
* .. Scalar Arguments ..
INTEGER INCX, N
COMPLEX A
* ..
* .. Array Arguments ..
COMPLEX X( * )
* ..
*
* =====================================================================
*
* .. Parameters ..
REAL ZERO, ONE
PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 )
* ..
* .. Local Scalars ..
REAL SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR
$ , UI
* ..
* .. External Functions ..
REAL SLAMCH
COMPLEX CLADIV
EXTERNAL SLAMCH, CLADIV
* ..
* .. External Subroutines ..
EXTERNAL CSCAL, CSSCAL, CSRSCL
* ..
* .. Intrinsic Functions ..
INTRINSIC ABS
* ..
* .. Executable Statements ..
*
* Quick return if possible
*
IF( N.LE.0 )
$ RETURN
*
* Get machine parameters
*
SAFMIN = SLAMCH( 'S' )
SAFMAX = ONE / SAFMIN
OV = SLAMCH( 'O' )
*
* Initialize constants related to A.
*
AR = REAL( A )
AI = AIMAG( A )
ABSR = ABS( AR )
ABSI = ABS( AI )
*
IF( AI.EQ.ZERO ) THEN
* If alpha is real, then we can use csrscl
CALL CSRSCL( N, AR, X, INCX )
*
ELSE IF( AR.EQ.ZERO ) THEN
* If alpha has a zero real part, then we follow the same rules as if
* alpha were real.
IF( ABSI.GT.SAFMAX ) THEN
CALL CSSCAL( N, SAFMIN, X, INCX )
CALL CSCAL( N, CMPLX( ZERO, -SAFMAX / AI ), X, INCX )
ELSE IF( ABSI.LT.SAFMIN ) THEN
CALL CSCAL( N, CMPLX( ZERO, -SAFMIN / AI ), X, INCX )
CALL CSSCAL( N, SAFMAX, X, INCX )
ELSE
CALL CSCAL( N, CMPLX( ZERO, -ONE / AI ), X, INCX )
END IF
*
ELSE
* The following numbers can be computed.
* They are the inverse of the real and imaginary parts of 1/alpha.
* Note that a and b are always different from zero.
* NaNs are only possible if either:
* 1. alphaR or alphaI is NaN.
* 2. alphaR and alphaI are both infinite, in which case it makes sense
* to propagate a NaN.
UR = AR + AI * ( AI / AR )
UI = AI + AR * ( AR / AI )
*
IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN
* This means that both alphaR and alphaI are very small.
CALL CSCAL( N, CMPLX( SAFMIN / UR, -SAFMIN / UI ), X, INCX )
CALL CSSCAL( N, SAFMAX, X, INCX )
ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN
IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN
* This means that a and b are both Inf. No need for scaling.
CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
ELSE
CALL CSSCAL( N, SAFMIN, X, INCX )
IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN
* Infs were generated. We do proper scaling to avoid them.
IF( ABSR.GE.ABSI ) THEN
* ABS( UR ) <= ABS( UI )
UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR ))
UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI )
ELSE
* ABS( UR ) > ABS( UI )
UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR )
UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI ))
END IF
CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
ELSE
CALL CSCAL( N, CMPLX( SAFMAX / UR, -SAFMAX / UI ),
$ X, INCX )
END IF
END IF
ELSE
CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
END IF
END IF
*
RETURN
*
* End of CRSCL
*
END
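The comment "They are the inverse of the real and imaginary parts of 1/alpha" above compresses a short derivation. Spelled out in my own notation (a_r = AR, a_i = AI, both nonzero), it shows why the code scales by CMPLX( ONE/UR, -ONE/UI ):

```latex
\[
\frac{1}{a_r + i\,a_i}
  = \frac{a_r - i\,a_i}{a_r^2 + a_i^2}
  = \frac{1}{a_r + a_i\!\left(\frac{a_i}{a_r}\right)}
    \;-\; \frac{i}{a_i + a_r\!\left(\frac{a_r}{a_i}\right)}
  = \frac{1}{U_R} - \frac{i}{U_I},
\]
\[
U_R = a_r + a_i\left(\frac{a_i}{a_r}\right), \qquad
U_I = a_i + a_r\left(\frac{a_r}{a_i}\right).
\]
```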

View File

@ -320,7 +320,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complexOTHERcomputational
*> \ingroup stemr
*
*> \par Contributors:
* ==================
@ -329,7 +329,8 @@
*> Jim Demmel, University of California, Berkeley, USA \n
*> Inderjit Dhillon, University of Texas, Austin, USA \n
*> Osni Marques, LBNL/NERSC, USA \n
*> Christof Voemel, University of California, Berkeley, USA
*> Christof Voemel, University of California, Berkeley, USA \n
*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
*
* =====================================================================
SUBROUTINE CSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@ -361,7 +362,8 @@
$ MINRGP = 3.0E-3 )
* ..
* .. Local Scalars ..
LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
$ LAESWAP
INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
$ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
$ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@ -397,6 +399,7 @@
*
LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
ZQUERY = ( NZC.EQ.-1 )
LAESWAP = .FALSE.
* SSTEMR needs WORK of size 6*N, IWORK of size 3*N.
* In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N.
@ -519,6 +522,15 @@
ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
END IF
* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
* the following code requires R1 >= R2. Hence, we correct
* the order of R1, R2, CS, SN if R1 < R2 before further processing.
IF( R1.LT.R2 ) THEN
E(2) = R1
R1 = R2
R2 = E(2)
LAESWAP = .TRUE.
ENDIF
IF( ALLEIG.OR.
$ (VALEIG.AND.(R2.GT.WL).AND.
$ (R2.LE.WU)).OR.
@ -526,8 +538,13 @@
M = M+1
W( M ) = R2
IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
IF( LAESWAP ) THEN
Z( 1, M ) = CS
Z( 2, M ) = SN
ELSE
Z( 1, M ) = -SN
Z( 2, M ) = CS
ENDIF
* Note: At most one of SN and CS can be zero.
IF (SN.NE.ZERO) THEN
IF (CS.NE.ZERO) THEN
@ -550,8 +567,13 @@
M = M+1
W( M ) = R1
IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
IF( LAESWAP ) THEN
Z( 1, M ) = -SN
Z( 2, M ) = CS
ELSE
Z( 1, M ) = CS
Z( 2, M ) = SN
ENDIF
* Note: At most one of SN and CS can be zero.
IF (SN.NE.ZERO) THEN
IF (CS.NE.ZERO) THEN
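The LAESWAP handling added above (and mirrored in DSTEMR and SSTEMR below) exists because xLAEV2 orders its two eigenvalues by magnitude, |R1| >= |R2|, while the surrounding code assumes R1 >= R2 by value. A worked case in my own notation, not taken from the patch: eigenvalues {-5, 3} come back as R1 = -5, R2 = 3, so R1 < R2 and the pair must be swapped together with the attached eigenvectors:

```latex
\[
A = Z\,\Lambda\,Z^{T}, \qquad
Z = \begin{pmatrix} \mathrm{CS} & -\mathrm{SN} \\ \mathrm{SN} & \mathrm{CS} \end{pmatrix}, \qquad
\Lambda = \operatorname{diag}(R_1,\, R_2),
\]
```

Exchanging R1 and R2 only remains a valid decomposition if the two columns of Z are exchanged as well, which is exactly what the LAESWAP branches do when writing Z(1,M) and Z(2,M).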

View File

@ -164,7 +164,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup doubleGEsolve
*> \ingroup gelss
*
* =====================================================================
SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@ -203,7 +203,7 @@
* ..
* .. External Subroutines ..
EXTERNAL DBDSQR, DCOPY, DGEBRD, DGELQF, DGEMM, DGEMV,
$ DGEQRF, DLABAD, DLACPY, DLASCL, DLASET, DORGBR,
$ DGEQRF, DLACPY, DLASCL, DLASET, DORGBR,
$ DORMBR, DORMLQ, DORMQR, DRSCL, XERBLA
* ..
* .. External Functions ..
@ -385,7 +385,6 @@
SFMIN = DLAMCH( 'S' )
SMLNUM = SFMIN / EPS
BIGNUM = ONE / SMLNUM
CALL DLABAD( SMLNUM, BIGNUM )
*
* Scale A if max element outside range [SMLNUM,BIGNUM]
*
@ -529,7 +528,7 @@
$ LDB, ZERO, WORK, N )
CALL DLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
20 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL DGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
CALL DCOPY( N, WORK, 1, B, 1 )
END IF
@ -626,7 +625,7 @@
CALL DLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
$ LDB )
40 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL DGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ),
$ 1, ZERO, WORK( IWORK ), 1 )
CALL DCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@ -712,7 +711,7 @@
$ LDB, ZERO, WORK, N )
CALL DLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
60 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL DGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
CALL DCOPY( N, WORK, 1, B, 1 )
END IF

View File

@ -303,7 +303,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup doubleOTHERcomputational
*> \ingroup stemr
*
*> \par Contributors:
* ==================
@ -312,7 +312,8 @@
*> Jim Demmel, University of California, Berkeley, USA \n
*> Inderjit Dhillon, University of Texas, Austin, USA \n
*> Osni Marques, LBNL/NERSC, USA \n
*> Christof Voemel, University of California, Berkeley, USA
*> Christof Voemel, University of California, Berkeley, USA \n
*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
*
* =====================================================================
SUBROUTINE DSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@ -344,7 +345,8 @@
$ MINRGP = 1.0D-3 )
* ..
* .. Local Scalars ..
LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
$ LAESWAP
INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
$ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
$ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@ -380,6 +382,7 @@
*
LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
ZQUERY = ( NZC.EQ.-1 )
LAESWAP = .FALSE.
* DSTEMR needs WORK of size 6*N, IWORK of size 3*N.
* In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N.
@ -502,6 +505,15 @@
ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
END IF
* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
* the following code requires R1 >= R2. Hence, we correct
* the order of R1, R2, CS, SN if R1 < R2 before further processing.
IF( R1.LT.R2 ) THEN
E(2) = R1
R1 = R2
R2 = E(2)
LAESWAP = .TRUE.
ENDIF
IF( ALLEIG.OR.
$ (VALEIG.AND.(R2.GT.WL).AND.
$ (R2.LE.WU)).OR.
@ -509,8 +521,13 @@
M = M+1
W( M ) = R2
IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
IF( LAESWAP ) THEN
Z( 1, M ) = CS
Z( 2, M ) = SN
ELSE
Z( 1, M ) = -SN
Z( 2, M ) = CS
ENDIF
* Note: At most one of SN and CS can be zero.
IF (SN.NE.ZERO) THEN
IF (CS.NE.ZERO) THEN
@ -533,8 +550,13 @@
M = M+1
W( M ) = R1
IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
IF( LAESWAP ) THEN
Z( 1, M ) = -SN
Z( 2, M ) = CS
ELSE
Z( 1, M ) = CS
Z( 2, M ) = SN
ENDIF
* Note: At most one of SN and CS can be zero.
IF (SN.NE.ZERO) THEN
IF (CS.NE.ZERO) THEN

View File

@ -164,7 +164,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup realGEsolve
*> \ingroup gelss
*
* =====================================================================
SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@ -202,7 +202,7 @@
* ..
* .. External Subroutines ..
EXTERNAL SBDSQR, SCOPY, SGEBRD, SGELQF, SGEMM, SGEMV,
$ SGEQRF, SLABAD, SLACPY, SLASCL, SLASET, SORGBR,
$ SGEQRF, SLACPY, SLASCL, SLASET, SORGBR,
$ SORMBR, SORMLQ, SORMQR, SRSCL, XERBLA
* ..
* .. External Functions ..
@ -381,7 +381,6 @@
SFMIN = SLAMCH( 'S' )
SMLNUM = SFMIN / EPS
BIGNUM = ONE / SMLNUM
CALL SLABAD( SMLNUM, BIGNUM )
*
* Scale A if max element outside range [SMLNUM,BIGNUM]
*
@ -525,7 +524,7 @@
$ LDB, ZERO, WORK, N )
CALL SLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
20 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL SGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
CALL SCOPY( N, WORK, 1, B, 1 )
END IF
@ -622,7 +621,7 @@
CALL SLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
$ LDB )
40 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL SGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ),
$ 1, ZERO, WORK( IWORK ), 1 )
CALL SCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@ -708,7 +707,7 @@
$ LDB, ZERO, WORK, N )
CALL SLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
60 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL SGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
CALL SCOPY( N, WORK, 1, B, 1 )
END IF

View File

@ -303,7 +303,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup realOTHERcomputational
*> \ingroup stemr
*
*> \par Contributors:
* ==================
@ -312,7 +312,8 @@
*> Jim Demmel, University of California, Berkeley, USA \n
*> Inderjit Dhillon, University of Texas, Austin, USA \n
*> Osni Marques, LBNL/NERSC, USA \n
*> Christof Voemel, University of California, Berkeley, USA
*> Christof Voemel, University of California, Berkeley, USA \n
*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
*
* =====================================================================
SUBROUTINE SSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@ -344,7 +345,8 @@
$ MINRGP = 3.0E-3 )
* ..
* .. Local Scalars ..
LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
$ LAESWAP
INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
$ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
$ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@ -378,6 +380,7 @@
*
LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
ZQUERY = ( NZC.EQ.-1 )
LAESWAP = .FALSE.
* SSTEMR needs WORK of size 6*N, IWORK of size 3*N.
* In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N.
@ -500,6 +503,15 @@
ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
END IF
* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
* the following code requires R1 >= R2. Hence, we correct
* the order of R1, R2, CS, SN if R1 < R2 before further processing.
IF( R1.LT.R2 ) THEN
E(2) = R1
R1 = R2
R2 = E(2)
LAESWAP = .TRUE.
ENDIF
IF( ALLEIG.OR.
$ (VALEIG.AND.(R2.GT.WL).AND.
$ (R2.LE.WU)).OR.
@ -507,8 +519,13 @@
M = M+1
W( M ) = R2
IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
IF( LAESWAP ) THEN
Z( 1, M ) = CS
Z( 2, M ) = SN
ELSE
Z( 1, M ) = -SN
Z( 2, M ) = CS
ENDIF
* Note: At most one of SN and CS can be zero.
IF (SN.NE.ZERO) THEN
IF (CS.NE.ZERO) THEN
@ -531,8 +548,13 @@
M = M+1
W( M ) = R1
IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
IF( LAESWAP ) THEN
Z( 1, M ) = -SN
Z( 2, M ) = CS
ELSE
Z( 1, M ) = CS
Z( 2, M ) = SN
ENDIF
* Note: At most one of SN and CS can be zero.
IF (SN.NE.ZERO) THEN
IF (CS.NE.ZERO) THEN

View File

@ -170,7 +170,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complex16GEsolve
*> \ingroup gelss
*
* =====================================================================
SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@ -212,10 +212,9 @@
COMPLEX*16 DUM( 1 )
* ..
* .. External Subroutines ..
EXTERNAL DLABAD, DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY,
$ ZDRSCL, ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF,
$ ZLACPY, ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ,
$ ZUNMQR
EXTERNAL DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, ZDRSCL,
$ ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, ZLACPY,
$ ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ
* ..
* .. External Functions ..
INTEGER ILAENV
@ -388,7 +387,6 @@
SFMIN = DLAMCH( 'S' )
SMLNUM = SFMIN / EPS
BIGNUM = ONE / SMLNUM
CALL DLABAD( SMLNUM, BIGNUM )
*
* Scale A if max element outside range [SMLNUM,BIGNUM]
*
@ -540,7 +538,7 @@
$ LDB, CZERO, WORK, N )
CALL ZLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
20 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL ZGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
CALL ZCOPY( N, WORK, 1, B, 1 )
END IF
@ -645,7 +643,7 @@
CALL ZLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
$ LDB )
40 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL ZGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ),
$ 1, CZERO, WORK( IWORK ), 1 )
CALL ZCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@ -737,7 +735,7 @@
$ LDB, CZERO, WORK, N )
CALL ZLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
60 CONTINUE
ELSE
ELSE IF( NRHS.EQ.1 ) THEN
CALL ZGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
CALL ZCOPY( N, WORK, 1, B, 1 )
END IF


@ -101,7 +101,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complex16GEcomputational
*> \ingroup getf2
*
* =====================================================================
SUBROUTINE ZGETF2( M, N, A, LDA, IPIV, INFO )
@ -127,7 +127,7 @@
* ..
* .. Local Scalars ..
DOUBLE PRECISION SFMIN
INTEGER I, J, JP
INTEGER J, JP
* ..
* .. External Functions ..
DOUBLE PRECISION DLAMCH
@ -135,7 +135,7 @@
EXTERNAL DLAMCH, IZAMAX
* ..
* .. External Subroutines ..
EXTERNAL XERBLA, ZGERU, ZSCAL, ZSWAP
EXTERNAL XERBLA, ZGERU, ZRSCL, ZSWAP
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@ -181,15 +181,8 @@
*
* Compute elements J+1:M of J-th column.
*
IF( J.LT.M ) THEN
IF( ABS(A( J, J )) .GE. SFMIN ) THEN
CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 )
ELSE
DO 20 I = 1, M-J
A( J+I, J ) = A( J+I, J ) / A( J, J )
20 CONTINUE
END IF
END IF
IF( J.LT.M )
$ CALL ZRSCL( M-J, A( J, J ), A( J+1, J ), 1 )
*
ELSE IF( INFO.EQ.0 ) THEN
*

lapack-netlib/SRC/zrscl.c Normal file

@ -0,0 +1,735 @@
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <complex.h>
#ifdef complex
#undef complex
#endif
#ifdef I
#undef I
#endif
#if defined(_WIN64)
typedef long long BLASLONG;
typedef unsigned long long BLASULONG;
#else
typedef long BLASLONG;
typedef unsigned long BLASULONG;
#endif
#ifdef LAPACK_ILP64
typedef BLASLONG blasint;
#if defined(_WIN64)
#define blasabs(x) llabs(x)
#else
#define blasabs(x) labs(x)
#endif
#else
typedef int blasint;
#define blasabs(x) abs(x)
#endif
typedef blasint integer;
typedef unsigned int uinteger;
typedef char *address;
typedef short int shortint;
typedef float real;
typedef double doublereal;
typedef struct { real r, i; } complex;
typedef struct { doublereal r, i; } doublecomplex;
#ifdef _MSC_VER
static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
#else
static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
#endif
#define pCf(z) (*_pCf(z))
#define pCd(z) (*_pCd(z))
typedef int logical;
typedef short int shortlogical;
typedef char logical1;
typedef char integer1;
#define TRUE_ (1)
#define FALSE_ (0)
/* Extern is for use with -E */
#ifndef Extern
#define Extern extern
#endif
/* I/O stuff */
typedef int flag;
typedef int ftnlen;
typedef int ftnint;
/*external read, write*/
typedef struct
{ flag cierr;
ftnint ciunit;
flag ciend;
char *cifmt;
ftnint cirec;
} cilist;
/*internal read, write*/
typedef struct
{ flag icierr;
char *iciunit;
flag iciend;
char *icifmt;
ftnint icirlen;
ftnint icirnum;
} icilist;
/*open*/
typedef struct
{ flag oerr;
ftnint ounit;
char *ofnm;
ftnlen ofnmlen;
char *osta;
char *oacc;
char *ofm;
ftnint orl;
char *oblnk;
} olist;
/*close*/
typedef struct
{ flag cerr;
ftnint cunit;
char *csta;
} cllist;
/*rewind, backspace, endfile*/
typedef struct
{ flag aerr;
ftnint aunit;
} alist;
/* inquire */
typedef struct
{ flag inerr;
ftnint inunit;
char *infile;
ftnlen infilen;
ftnint *inex; /*parameters in standard's order*/
ftnint *inopen;
ftnint *innum;
ftnint *innamed;
char *inname;
ftnlen innamlen;
char *inacc;
ftnlen inacclen;
char *inseq;
ftnlen inseqlen;
char *indir;
ftnlen indirlen;
char *infmt;
ftnlen infmtlen;
char *inform;
ftnint informlen;
char *inunf;
ftnlen inunflen;
ftnint *inrecl;
ftnint *innrec;
char *inblank;
ftnlen inblanklen;
} inlist;
#define VOID void
union Multitype { /* for multiple entry points */
integer1 g;
shortint h;
integer i;
/* longint j; */
real r;
doublereal d;
complex c;
doublecomplex z;
};
typedef union Multitype Multitype;
struct Vardesc { /* for Namelist */
char *name;
char *addr;
ftnlen *dims;
int type;
};
typedef struct Vardesc Vardesc;
struct Namelist {
char *name;
Vardesc **vars;
int nvars;
};
typedef struct Namelist Namelist;
#define abs(x) ((x) >= 0 ? (x) : -(x))
#define dabs(x) (fabs(x))
#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
#define dmin(a,b) (f2cmin(a,b))
#define dmax(a,b) (f2cmax(a,b))
#define bit_test(a,b) ((a) >> (b) & 1)
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
#define abort_() { sig_die("Fortran abort routine called", 1); }
#define c_abs(z) (cabsf(Cf(z)))
#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
#ifdef _MSC_VER
#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
#else
#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
#endif
#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
#define d_abs(x) (fabs(*(x)))
#define d_acos(x) (acos(*(x)))
#define d_asin(x) (asin(*(x)))
#define d_atan(x) (atan(*(x)))
#define d_atn2(x, y) (atan2(*(x),*(y)))
#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
#define d_cos(x) (cos(*(x)))
#define d_cosh(x) (cosh(*(x)))
#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
#define d_exp(x) (exp(*(x)))
#define d_imag(z) (cimag(Cd(z)))
#define r_imag(z) (cimagf(Cf(z)))
#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
#define d_log(x) (log(*(x)))
#define d_mod(x, y) (fmod(*(x), *(y)))
#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
#define d_nint(x) u_nint(*(x))
#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
#define d_sign(a,b) u_sign(*(a),*(b))
#define r_sign(a,b) u_sign(*(a),*(b))
#define d_sin(x) (sin(*(x)))
#define d_sinh(x) (sinh(*(x)))
#define d_sqrt(x) (sqrt(*(x)))
#define d_tan(x) (tan(*(x)))
#define d_tanh(x) (tanh(*(x)))
#define i_abs(x) abs(*(x))
#define i_dnnt(x) ((integer)u_nint(*(x)))
#define i_len(s, n) (n)
#define i_nint(x) ((integer)u_nint(*(x)))
#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
#define pow_si(B,E) spow_ui(*(B),*(E))
#define pow_ri(B,E) spow_ui(*(B),*(E))
#define pow_di(B,E) dpow_ui(*(B),*(E))
#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; }
#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
#define sig_die(s, kill) { exit(1); }
#define s_stop(s, n) {exit(0);}
static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
#define z_abs(z) (cabs(Cd(z)))
#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
#define myexit_() break;
#define mycycle_() continue;
#define myceiling_(w) {ceil(w)}
#define myhuge_(w) {HUGE_VAL}
//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
/* procedure parameter types for -A and -C++ */
#define F2C_proc_par_types 1
#ifdef __cplusplus
typedef logical (*L_fp)(...);
#else
typedef logical (*L_fp)();
#endif
static float spow_ui(float x, integer n) {
float pow=1.0; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x = 1/x;
for(u = n; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
static double dpow_ui(double x, integer n) {
double pow=1.0; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x = 1/x;
for(u = n; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
#ifdef _MSC_VER
static _Fcomplex cpow_ui(complex x, integer n) {
complex pow={1.0,0.0}; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
for(u = n; ; ) {
if(u & 01) pow.r *= x.r, pow.i *= x.i;
if(u >>= 1) x.r *= x.r, x.i *= x.i;
else break;
}
}
_Fcomplex p={pow.r, pow.i};
return p;
}
#else
static _Complex float cpow_ui(_Complex float x, integer n) {
_Complex float pow=1.0; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x = 1/x;
for(u = n; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
#endif
#ifdef _MSC_VER
static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
_Dcomplex pow={1.0,0.0}; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
for(u = n; ; ) {
if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
else break;
}
}
_Dcomplex p = {pow._Val[0], pow._Val[1]};
return p;
}
#else
static _Complex double zpow_ui(_Complex double x, integer n) {
_Complex double pow=1.0; unsigned long int u;
if(n != 0) {
if(n < 0) n = -n, x = 1/x;
for(u = n; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
#endif
static integer pow_ii(integer x, integer n) {
integer pow; unsigned long int u;
if (n <= 0) {
if (n == 0 || x == 1) pow = 1;
else if (x != -1) pow = x == 0 ? 1/x : 0;
else n = -n;
}
if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
u = n;
for(pow = 1; ; ) {
if(u & 01) pow *= x;
if(u >>= 1) x *= x;
else break;
}
}
return pow;
}
static integer dmaxloc_(double *w, integer s, integer e, integer *n)
{
double m; integer i, mi;
for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
if (w[i-1]>m) mi=i ,m=w[i-1];
return mi-s+1;
}
static integer smaxloc_(float *w, integer s, integer e, integer *n)
{
float m; integer i, mi;
for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
if (w[i-1]>m) mi=i ,m=w[i-1];
return mi-s+1;
}
static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
integer n = *n_, incx = *incx_, incy = *incy_, i;
#ifdef _MSC_VER
_Fcomplex zdotc = {0.0, 0.0};
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
}
}
pCf(z) = zdotc;
}
#else
_Complex float zdotc = 0.0;
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
}
}
pCf(z) = zdotc;
}
#endif
static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
integer n = *n_, incx = *incx_, incy = *incy_, i;
#ifdef _MSC_VER
_Dcomplex zdotc = {0.0, 0.0};
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
}
}
pCd(z) = zdotc;
}
#else
_Complex double zdotc = 0.0;
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
}
}
pCd(z) = zdotc;
}
#endif
static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
integer n = *n_, incx = *incx_, incy = *incy_, i;
#ifdef _MSC_VER
_Fcomplex zdotc = {0.0, 0.0};
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
}
}
pCf(z) = zdotc;
}
#else
_Complex float zdotc = 0.0;
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += Cf(&x[i]) * Cf(&y[i]);
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
}
}
pCf(z) = zdotc;
}
#endif
static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
integer n = *n_, incx = *incx_, incy = *incy_, i;
#ifdef _MSC_VER
_Dcomplex zdotc = {0.0, 0.0};
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
}
}
pCd(z) = zdotc;
}
#else
_Complex double zdotc = 0.0;
if (incx == 1 && incy == 1) {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += Cd(&x[i]) * Cd(&y[i]);
}
} else {
for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
}
}
pCd(z) = zdotc;
}
#endif
/* -- translated by f2c (version 20000121).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
/* -- translated by f2c (version 20000121).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
/* > \brief \b ZRSCL multiplies a vector by the reciprocal of a complex scalar. */
/* =========== DOCUMENTATION =========== */
/* Online html documentation available at */
/* http://www.netlib.org/lapack/explore-html/ */
/* > \htmlonly */
/* > Download ZDRSCL + dependencies */
/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zdrscl.
f"> */
/* > [TGZ]</a> */
/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zdrscl.
f"> */
/* > [ZIP]</a> */
/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zdrscl.
f"> */
/* > [TXT]</a> */
/* > \endhtmlonly */
/* Definition: */
/* =========== */
/* SUBROUTINE ZRSCL( N, A, X, INCX ) */
/* INTEGER INCX, N */
/* COMPLEX*16 A */
/* COMPLEX*16 X( * ) */
/* > \par Purpose: */
/* ============= */
/* > */
/* > \verbatim */
/* > */
/* > ZRSCL multiplies an n-element complex vector x by the complex scalar */
/* > 1/a. This is done without overflow or underflow as long as */
/* > the final result x/a does not overflow or underflow. */
/* > \endverbatim */
/* Arguments: */
/* ========== */
/* > \param[in] N */
/* > \verbatim */
/* > N is INTEGER */
/* > The number of components of the vector x. */
/* > \endverbatim */
/* > */
/* > \param[in] A */
/* > \verbatim */
/* > A is COMPLEX*16 */
/* > The scalar a which is used to divide each component of x. */
/* > A must not be 0, or the subroutine will divide by zero. */
/* > \endverbatim */
/* > */
/* > \param[in,out] X */
/* > \verbatim */
/* > X is COMPLEX*16 array, dimension */
/* > (1+(N-1)*abs(INCX)) */
/* > The n-element vector x. */
/* > \endverbatim */
/* > */
/* > \param[in] INCX */
/* > \verbatim */
/* > INCX is INTEGER */
/* > The increment between successive values of the vector SX. */
/* > > 0: SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i), 1< i<= n */
/* > \endverbatim */
/* Authors: */
/* ======== */
/* > \author Univ. of Tennessee */
/* > \author Univ. of California Berkeley */
/* > \author Univ. of Colorado Denver */
/* > \author NAG Ltd. */
/* > \ingroup complex16OTHERauxiliary */
/* ===================================================================== */
/* Subroutine */ int zrscl_(integer *n, doublecomplex *a, doublecomplex *x,
integer *incx)
{
/* System generated locals */
doublereal d__1, d__2;
doublecomplex z__1;
/* Local variables */
doublereal absi, absr;
extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
doublecomplex *, integer *);
doublereal ai, ar;
extern doublereal dlamch_(char *);
doublereal ui, ov, ur, safmin, safmax;
extern /* Subroutine */ int zdscal_(integer *, doublereal *,
doublecomplex *, integer *), zdrscl_(integer *, doublereal *,
doublecomplex *, integer *);
/* -- LAPACK auxiliary routine -- */
/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */
/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
/* ===================================================================== */
/* Quick return if possible */
/* Parameter adjustments */
--x;
/* Function Body */
if (*n <= 0) {
return 0;
}
/* Get machine parameters */
safmin = dlamch_("S");
safmax = 1. / safmin;
ov = dlamch_("O");
/* Initialize constants related to A. */
ar = a->r;
ai = d_imag(a);
absr = abs(ar);
absi = abs(ai);
if (ai == 0.) {
/*     If alpha is real, then we can use zdrscl */
zdrscl_(n, &ar, &x[1], incx);
} else if (ar == 0.) {
/* If alpha has a zero real part, then we follow the same rules as if */
/* alpha were real. */
if (absi > safmax) {
zdscal_(n, &safmin, &x[1], incx);
d__1 = -safmax / ai;
z__1.r = 0., z__1.i = d__1;
zscal_(n, &z__1, &x[1], incx);
} else if (absi < safmin) {
d__1 = -safmin / ai;
z__1.r = 0., z__1.i = d__1;
zscal_(n, &z__1, &x[1], incx);
zdscal_(n, &safmax, &x[1], incx);
} else {
d__1 = -1. / ai;
z__1.r = 0., z__1.i = d__1;
zscal_(n, &z__1, &x[1], incx);
}
} else {
/* The following numbers can be computed. */
/* They are the inverse of the real and imaginary parts of 1/alpha. */
/* Note that a and b are always different from zero. */
/* NaNs are only possible if either: */
/* 1. alphaR or alphaI is NaN. */
/* 2. alphaR and alphaI are both infinite, in which case it makes sense */
/* to propagate a NaN. */
ur = ar + ai * (ai / ar);
ui = ai + ar * (ar / ai);
if (abs(ur) < safmin || abs(ui) < safmin) {
/* This means that both alphaR and alphaI are very small. */
d__1 = safmin / ur;
d__2 = -safmin / ui;
z__1.r = d__1, z__1.i = d__2;
zscal_(n, &z__1, &x[1], incx);
zdscal_(n, &safmax, &x[1], incx);
} else if (abs(ur) > safmax || abs(ui) > safmax) {
if (absr > ov || absi > ov) {
/* This means that a and b are both Inf. No need for scaling. */
d__1 = 1. / ur;
d__2 = -1. / ui;
z__1.r = d__1, z__1.i = d__2;
zscal_(n, &z__1, &x[1], incx);
} else {
zdscal_(n, &safmin, &x[1], incx);
if (abs(ur) > ov || abs(ui) > ov) {
/* Infs were generated. We do proper scaling to avoid them. */
if (absr >= absi) {
/* ABS( UR ) <= ABS( UI ) */
ur = safmin * ar + safmin * (ai * (ai / ar));
ui = safmin * ai + ar * (safmin * ar / ai);
} else {
/* ABS( UR ) > ABS( UI ) */
ur = safmin * ar + ai * (safmin * ai / ar);
ui = safmin * ai + safmin * (ar * (ar / ai));
}
d__1 = 1. / ur;
d__2 = -1. / ui;
z__1.r = d__1, z__1.i = d__2;
zscal_(n, &z__1, &x[1], incx);
} else {
d__1 = safmax / ur;
d__2 = -safmax / ui;
z__1.r = d__1, z__1.i = d__2;
zscal_(n, &z__1, &x[1], incx);
}
}
} else {
d__1 = 1. / ur;
d__2 = -1. / ui;
z__1.r = d__1, z__1.i = d__2;
zscal_(n, &z__1, &x[1], incx);
}
}
return 0;
/* End of ZRSCL */
} /* zrscl_ */

lapack-netlib/SRC/zrscl.f Normal file

@ -0,0 +1,203 @@
*> \brief \b ZRSCL multiplies a vector by the reciprocal of a complex scalar.
*
* =========== DOCUMENTATION ===========
*
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
*> \htmlonly
*> Download ZDRSCL + dependencies
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zdrscl.f">
*> [TGZ]</a>
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zdrscl.f">
*> [ZIP]</a>
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zdrscl.f">
*> [TXT]</a>
*> \endhtmlonly
*
* Definition:
* ===========
*
* SUBROUTINE ZRSCL( N, A, X, INCX )
*
* .. Scalar Arguments ..
* INTEGER INCX, N
* COMPLEX*16 A
* ..
* .. Array Arguments ..
* COMPLEX*16 X( * )
* ..
*
*
*> \par Purpose:
* =============
*>
*> \verbatim
*>
*> ZRSCL multiplies an n-element complex vector x by the complex scalar
*> 1/a. This is done without overflow or underflow as long as
*> the final result x/a does not overflow or underflow.
*> \endverbatim
*
* Arguments:
* ==========
*
*> \param[in] N
*> \verbatim
*> N is INTEGER
*> The number of components of the vector x.
*> \endverbatim
*>
*> \param[in] A
*> \verbatim
*> A is COMPLEX*16
*> The scalar a which is used to divide each component of x.
*> A must not be 0, or the subroutine will divide by zero.
*> \endverbatim
*>
*> \param[in,out] X
*> \verbatim
*> X is COMPLEX*16 array, dimension
*> (1+(N-1)*abs(INCX))
*> The n-element vector x.
*> \endverbatim
*>
*> \param[in] INCX
*> \verbatim
*> INCX is INTEGER
*> The increment between successive values of the vector SX.
*> > 0: SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i), 1< i<= n
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complex16OTHERauxiliary
*
* =====================================================================
SUBROUTINE ZRSCL( N, A, X, INCX )
*
* -- LAPACK auxiliary routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
*
* .. Scalar Arguments ..
INTEGER INCX, N
COMPLEX*16 A
* ..
* .. Array Arguments ..
COMPLEX*16 X( * )
* ..
*
* =====================================================================
*
* .. Parameters ..
DOUBLE PRECISION ZERO, ONE
PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 )
* ..
* .. Local Scalars ..
DOUBLE PRECISION SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR, UI
* ..
* .. External Functions ..
DOUBLE PRECISION DLAMCH
COMPLEX*16 ZLADIV
EXTERNAL DLAMCH, ZLADIV
* ..
* .. External Subroutines ..
EXTERNAL DSCAL, ZDSCAL, ZDRSCL
* ..
* .. Intrinsic Functions ..
INTRINSIC ABS
* ..
* .. Executable Statements ..
*
* Quick return if possible
*
IF( N.LE.0 )
$ RETURN
*
* Get machine parameters
*
SAFMIN = DLAMCH( 'S' )
SAFMAX = ONE / SAFMIN
OV = DLAMCH( 'O' )
*
* Initialize constants related to A.
*
AR = DBLE( A )
AI = DIMAG( A )
ABSR = ABS( AR )
ABSI = ABS( AI )
*
IF( AI.EQ.ZERO ) THEN
*        If alpha is real, then we can use ZDRSCL
CALL ZDRSCL( N, AR, X, INCX )
*
ELSE IF( AR.EQ.ZERO ) THEN
* If alpha has a zero real part, then we follow the same rules as if
* alpha were real.
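*        (For A = (0,AI) we have 1/A = (0,-1/AI), hence the purely
*        imaginary scale factors of the form -c/AI used below.)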
IF( ABSI.GT.SAFMAX ) THEN
CALL ZDSCAL( N, SAFMIN, X, INCX )
CALL ZSCAL( N, DCMPLX( ZERO, -SAFMAX / AI ), X, INCX )
ELSE IF( ABSI.LT.SAFMIN ) THEN
CALL ZSCAL( N, DCMPLX( ZERO, -SAFMIN / AI ), X, INCX )
CALL ZDSCAL( N, SAFMAX, X, INCX )
ELSE
CALL ZSCAL( N, DCMPLX( ZERO, -ONE / AI ), X, INCX )
END IF
*
ELSE
* The following numbers can be computed.
* They are the inverse of the real and imaginary parts of 1/alpha.
* Note that a and b are always different from zero.
* NaNs are only possible if either:
* 1. alphaR or alphaI is NaN.
* 2. alphaR and alphaI are both infinite, in which case it makes sense
* to propagate a NaN.
UR = AR + AI * ( AI / AR )
UI = AI + AR * ( AR / AI )
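*        In exact arithmetic UR = (AR**2+AI**2)/AR and
*        UI = (AR**2+AI**2)/AI, so ONE/UR is the real part of 1/A
*        and -ONE/UI is its imaginary part; the branches below only
*        guard these quantities against overflow and underflow.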
*
IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN
* This means that both alphaR and alphaI are very small.
CALL ZSCAL( N, DCMPLX( SAFMIN / UR, -SAFMIN / UI ), X,
$ INCX )
CALL ZDSCAL( N, SAFMAX, X, INCX )
ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN
IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN
* This means that a and b are both Inf. No need for scaling.
CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX )
ELSE
CALL ZDSCAL( N, SAFMIN, X, INCX )
IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN
* Infs were generated. We do proper scaling to avoid them.
IF( ABSR.GE.ABSI ) THEN
* ABS( UR ) <= ABS( UI )
UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR ))
UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI )
ELSE
* ABS( UR ) > ABS( UI )
UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR )
UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI ))
END IF
CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X,
$ INCX )
ELSE
CALL ZSCAL( N, DCMPLX( SAFMAX / UR, -SAFMAX / UI ),
$ X, INCX )
END IF
END IF
ELSE
CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX )
END IF
END IF
*
RETURN
*
* End of ZRSCL
*
END
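
A minimal C sketch of how the new ZRSCL routine might be exercised through the f2c-style interface shown in zrscl.c above, checking the result against an ordinary complex division. The prototype, the struct layout and the LP64 assumption (LAPACK integers mapping to C int) follow the typedefs at the top of zrscl.c; the program has to be linked against a library that actually provides zrscl_, e.g. an OpenBLAS built from this branch.

#include <complex.h>
#include <stdio.h>

/* f2c-style prototype matching zrscl.c above (LP64: integer == int);
   the struct layout mirrors the doublecomplex typedef used there.   */
typedef struct { double r, i; } doublecomplex;
extern int zrscl_(int *n, doublecomplex *a, doublecomplex *x, int *incx);

int main(void)
{
    int n = 3, incx = 1, i;
    doublecomplex a = { 3.0, 4.0 };                 /* divide by a = 3 + 4i */
    doublecomplex x[3] = { {1.0, 0.0}, {0.0, 2.0}, {-5.0, 1.0} };
    double complex ref[3];

    /* Reference result via ordinary complex division. */
    for (i = 0; i < 3; i++)
        ref[i] = (x[i].r + x[i].i * I) / (3.0 + 4.0 * I);

    zrscl_(&n, &a, x, &incx);                       /* x := x * (1/a) */

    for (i = 0; i < 3; i++)
        printf("x[%d] = (% .6f,% .6f)   ref = (% .6f,% .6f)\n",
               i, x[i].r, x[i].i, creal(ref[i]), cimag(ref[i]));
    return 0;
}

For an ILP64 build (INTERFACE64=1) the integer arguments would instead need a 64-bit type, matching the blasint/BLASLONG typedefs at the top of zrscl.c.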


@ -320,7 +320,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complex16OTHERcomputational
*> \ingroup stemr
*
*> \par Contributors:
* ==================
@ -330,6 +330,7 @@
*> Inderjit Dhillon, University of Texas, Austin, USA \n
*> Osni Marques, LBNL/NERSC, USA \n
*> Christof Voemel, University of California, Berkeley, USA \n
*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
*
* =====================================================================
SUBROUTINE ZSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@ -361,7 +362,8 @@
$ MINRGP = 1.0D-3 )
* ..
* .. Local Scalars ..
LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
$ LAESWAP
INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
$ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
$ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@ -397,6 +399,7 @@
*
LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
ZQUERY = ( NZC.EQ.-1 )
LAESWAP = .FALSE.
* DSTEMR needs WORK of size 6*N, IWORK of size 3*N.
* In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N.
@ -519,6 +522,15 @@
ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
END IF
* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
* the following code requires R1 >= R2. Hence, we correct
* the order of R1, R2, CS, SN if R1 < R2 before further processing.
IF( R1.LT.R2 ) THEN
E(2) = R1
R1 = R2
R2 = E(2)
LAESWAP = .TRUE.
ENDIF
IF( ALLEIG.OR.
$ (VALEIG.AND.(R2.GT.WL).AND.
$ (R2.LE.WU)).OR.
@ -526,8 +538,13 @@
M = M+1
W( M ) = R2
IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
IF( LAESWAP ) THEN
Z( 1, M ) = CS
Z( 2, M ) = SN
ELSE
Z( 1, M ) = -SN
Z( 2, M ) = CS
ENDIF
* Note: At most one of SN and CS can be zero.
IF (SN.NE.ZERO) THEN
IF (CS.NE.ZERO) THEN
@ -550,8 +567,13 @@
M = M+1
W( M ) = R1
IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
IF( LAESWAP ) THEN
Z( 1, M ) = -SN
Z( 2, M ) = CS
ELSE
Z( 1, M ) = CS
Z( 2, M ) = SN
ENDIF
* Note: At most one of SN and CS can be zero.
IF (SN.NE.ZERO) THEN
IF (CS.NE.ZERO) THEN