Merge pull request #3216 from xianyi/develop

Update from develop for 0.3.15 release
2021-05-02 23:48:28 +02:00 · 2021-05-02 23:48:28 +02:00 · 65502c6af6
parent 2f6d35c3d4 f71627fa2e
commit 65502c6af6
236 changed files with 12364 additions and 2380 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -224,12 +224,21 @@ matrix:
      before_script:
        - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
        - brew update
-        - brew install gcc@10
      script:
        - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
      env:
-        - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
-        
+        - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
+
+    - <<: *test-macos
+      osx_image: xcode12
+      before_script:
+        - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
+        - brew update
+      script:
+        - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
+      env:
+        - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"      
+
  #  - <<: *test-macos
  #    osx_image: xcode10
  #    env:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 14)
+set(OpenBLAS_PATCH_VERSION 14.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

 # Adhere to GNU filesystem layout conventions
--- a/Changelog.txt
+++ b/Changelog.txt
@ -1,4 +1,54 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.15
+  2-May-2021
+
+common:
+ - imported improvements and bugfixes from Reference-LAPACK 3.9.1
+ - imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537
+ - fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation
+ - fixed a sequence problem in the generation of softlinks to the library in GMAKE
+
+RISC V:
+ - fixed compilation on RISCV (missing entry in getarch)
+ - fixed a potential division by zero in CROTG and ZROTG
+
+POWER:
+ - fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler
+ - improved CGEMM, DGEMM and ZGEMM performance on POWER10
+ - added an optimized ZGEMV kernel for POWER10
+ - fixed a potential division by zero in CROTG and ZROTG
+
+x86_64:
+ - added support for Intel Control-flow Enforcement Technology (CET)
+ - reverted the DOMATCOPY_RT code to the generic C version
+ - fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14
+ - fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH
+ - added support for compilation of the benchmarks on older OSX versions
+ - fix propagation of the NO_AVX512 option in CMAKE builds
+ - fix compilation of the AVX512 SGEMM kernel with clang-cl on Windows
+ - fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX)
+ - corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512
+
+ARM:
+ - fixed a potential division by zero in CROTG and ZROTG
+ - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
+
+ARM64:
+ - fixed spurious reads outside the array in the SGEMM tcopy macro
+ - fixed a potential division by zero in CROTG and ZROTG
+ - fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14)
+
+MIPS
+ - fixed a potential division by zero in CROTG and ZROTG
+ - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
+
+MIPS64:
+ - fixed a potential division by zero in CROTG and ZROTG
+
+SPARC:
+ - fixed a potential division by zero in CROTG and ZROTG
+
 ====================================================================
 Version 0.3.14
 17-Mar-2021
--- a/2
+++ b/2
@ -167,7 +167,6 @@ ifeq ($(NO_SHARED), 1)
 	$(error OpenBLAS: neither static nor shared are enabled.)
 endif
 endif
-	@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	@for d in $(SUBDIRS) ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d $(@F) || exit 1 ; \
@ -196,6 +195,7 @@ endif
 ifdef USE_THREAD
 	@echo USE_THREAD=$(USE_THREAD) >>  Makefile.conf_last
 endif
+	@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	@touch lib.grd

 prof : prof_blas prof_lapack
--- a/Makefile.rule
+++ b/Makefile.rule
@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.14
+VERSION = 0.3.14.dev

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
--- a/Makefile.x86
+++ b/Makefile.x86
@ -1,10 +1,21 @@
 # COMPILER_PREFIX = mingw32-

-ifdef HAVE_SSE
-CCOMMON_OPT += -msse
-FCOMMON_OPT += -msse
+ifndef DYNAMIC_ARCH
+ADD_CPUFLAGS = 1
+else
+ifdef TARGET_CORE
+ADD_CPUFLAGS = 1
+endif
 endif

+ifdef ADD_CPUFLAGS
+ifdef HAVE_SSE
+CCOMMON_OPT += -msse
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -msse
+endif
+endif
+endif

 ifeq ($(OSNAME), Interix)
 ARFLAGS		= -m x86
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@ -8,6 +8,16 @@ endif
 endif
 endif

+
+ifndef DYNAMIC_ARCH
+ADD_CPUFLAGS = 1
+else
+ifdef TARGET_CORE
+ADD_CPUFLAGS = 1
+endif
+endif
+
+ifdef ADD_CPUFLAGS
 ifdef HAVE_SSE3
 CCOMMON_OPT += -msse3
 ifneq ($(F_COMPILER), NAG)
@ -44,7 +54,6 @@ endif
 endif

 ifeq ($(CORE), SKYLAKEX)
-ifndef DYNAMIC_ARCH
 ifndef NO_AVX512
 CCOMMON_OPT += -march=skylake-avx512
 ifneq ($(F_COMPILER), NAG)
@ -62,10 +71,8 @@ endif
 endif
 endif
 endif
-endif

 ifeq ($(CORE), COOPERLAKE)
-ifndef DYNAMIC_ARCH
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
 # cooperlake support was added in 10.1
@ -88,7 +95,6 @@ endif
 endif
 endif
 endif
-endif

 ifdef HAVE_AVX2
 ifndef NO_AVX2
@ -120,6 +126,7 @@ endif
 endif
 endif

+endif


 ifeq ($(OSNAME), Interix)
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -4,7 +4,15 @@ trigger:
  branches:
    include:
      - develop
-
+resources:
+  containers:
+      - container: oneapi-hpckit
+        image: intel/oneapi-hpckit:latest
+        options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
+      - container: oneapi-basekit
+        image: intel/oneapi-basekit:latest
+        options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
+ 
 jobs:
 # manylinux1 is useful to test because the
 # standard Docker container uses an old version
@ -68,4 +76,64 @@ jobs:
      dir
      openblas_utest.exe
  
-      
+- job: OSX_OpenMP
+  pool:
+     vmImage: 'macOS-10.15'
+  steps:   
+  - script: |
+      brew update
+      make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
+     
+- job: OSX_GCC_Nothreads
+  pool:
+     vmImage: 'macOS-10.15'
+  steps:   
+  - script: |
+      brew update
+      make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
+     
+- job: OSX_OpenMP_Clang
+  pool:
+     vmImage: 'macOS-10.15'
+  variables:
+     LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
+     LIBRARY_PATH: /usr/local/opt/llvm/lib
+  steps:   
+  - script: |
+      brew update
+      brew install llvm libomp
+      make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
+
+- job: OSX_Ifort_Clang
+  pool:
+     vmImage: 'macOS-10.15'
+  variables:
+     LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
+     MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
+     LIBRARY_PATH: /usr/local/opt/llvm/lib
+     MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler
+  steps:   
+  - script: |
+      brew update
+      brew install llvm libomp
+      sudo mkdir -p /opt/intel
+      sudo chown $USER /opt/intel
+    displayName: prepare for cache restore
+  - task: Cache@2
+    inputs:
+      path: /opt/intel/oneapi
+      key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"'
+      cacheHitVar: CACHE_RESTORED
+  - script: | 
+      curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5
+      hdiutil attach webimage.dmg
+      sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=.
+      installer_exit_code=$?
+      hdiutil detach /Volumes/"$(basename "$URL" .dmg)" -quiet
+      exit $installer_exit_code
+    displayName: install
+    condition: ne(variables.CACHE_RESTORED, 'true')
+  - script: | 
+      source /opt/intel/oneapi/setvars.sh
+      make CC=/usr/local/opt/llvm/bin/clang FC=ifort
+ 
--- a/benchmark/bench.h
+++ b/benchmark/bench.h
@ -3,6 +3,8 @@
 #include <time.h>
 #ifdef __CYGWIN32__
 #include <sys/time.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
 #endif
 #include "common.h"

--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@ -66,7 +66,7 @@ set(SLASRC
   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
   slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
   slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
-   slarf.f  slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
+   slarf.f  slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
   slarrv.f slartv.f
   slarz.f  slarzb.f slarzt.f slasy2.f
   slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
@ -112,14 +112,14 @@ set(SLASRC
   sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
   stpqrt.f stpqrt2.f stpmqrt.f stprfb.f
   sgelqt.f sgelqt3.f sgemlqt.f
-   sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
+   sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
   sgelq.f slaswlq.f slamswlq.f sgemlq.f
   stplqt.f stplqt2.f stpmlqt.f
   ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
   ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
   ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
   sgesvdq.f slaorhr_col_getrfnp.f
-   slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
+   slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f )

 set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
   sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@ -171,7 +171,7 @@ set(CLASRC
   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
   claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
   claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
-   clarf.f  clarfb.f clarfg.f clarfgp.f clarft.f
+   clarf.f  clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
   clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
   clarz.f  clarzb.f clarzt.f clascl.f claset.f clasr.f  classq.f
   clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
@ -209,14 +209,14 @@ set(CLASRC
   cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
   ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f
   cgelqt.f cgelqt3.f cgemlqt.f
-   cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
+   cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
   cgelq.f claswlq.f clamswlq.f cgemlq.f
   ctplqt.f ctplqt2.f ctpmlqt.f
   chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
   cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
   chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
   cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f 
-   cungtsqr.f cunhr_col.f )
+   cungtsqr.f cungtsqr_row.f cunhr_col.f )

 set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
   cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@ -253,7 +253,7 @@ set(DLASRC
   dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
   dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
   dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
-   dlarf.f  dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
+   dlarf.f  dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
   dlargv.f dlarrv.f dlartv.f
   dlarz.f  dlarzb.f dlarzt.f dlasy2.f
   dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
@ -300,14 +300,14 @@ set(DLASRC
   dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
   dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f
   dgelqt.f dgelqt3.f dgemlqt.f
-   dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
+   dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
   dgelq.f dlaswlq.f dlamswlq.f dgemlq.f
   dtplqt.f dtplqt2.f dtpmlqt.f
   dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
   dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
   dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
   dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
-   dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
+   dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f )

 set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
   dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@ -360,7 +360,7 @@ set(ZLASRC
   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
   zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
   zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
-   zlarcm.f zlarf.f  zlarfb.f
+   zlarcm.f zlarf.f  zlarfb.f zlarfb_gett.f
   zlarfg.f zlarfgp.f zlarft.f
   zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
   zlarz.f  zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
@ -402,13 +402,13 @@ set(ZLASRC
   ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f
   ztplqt.f ztplqt2.f ztpmlqt.f
   zgelqt.f zgelqt3.f zgemlqt.f
-   zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
+   zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
   zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
   zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
   zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
   zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
   zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
-   zungtsqr.f zunhr_col.f)
+   zungtsqr.f zungtsqr_row.f zunhr_col.f)

 set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
   zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@ -114,6 +114,8 @@ set(CSRC
  lapacke_cgetrs_work.c
  lapacke_cgetsls.c
  lapacke_cgetsls_work.c
+  lapacke_cgetsqrhrt.c
+  lapacke_cgetsqrhrt_work.c 
  lapacke_cggbak.c
  lapacke_cggbak_work.c
  lapacke_cggbal.c
@ -590,6 +592,8 @@ set(CSRC
  lapacke_cungrq_work.c
  lapacke_cungtr.c
  lapacke_cungtr_work.c
+  lapacke_cungtsqr_row.c
+  lapacke_cungtsqr_row_work.c
  lapacke_cunmbr.c
  lapacke_cunmbr_work.c
  lapacke_cunmhr.c
@ -735,6 +739,8 @@ set(DSRC
  lapacke_dgetrs_work.c
  lapacke_dgetsls.c
  lapacke_dgetsls_work.c
+  lapacke_dgetsqrhrt.c
+  lapacke_dgetsqrhrt_work.c
  lapacke_dggbak.c
  lapacke_dggbak_work.c
  lapacke_dggbal.c
@ -862,6 +868,8 @@ set(DSRC
  lapacke_dorgrq_work.c
  lapacke_dorgtr.c
  lapacke_dorgtr_work.c
+  lapacke_dorgtsqr_row.c
+  lapacke_dorgtsqr_row_work.c
  lapacke_dormbr.c
  lapacke_dormbr_work.c
  lapacke_dormhr.c
@ -1309,6 +1317,8 @@ set(SSRC
  lapacke_sgetrs_work.c
  lapacke_sgetsls.c
  lapacke_sgetsls_work.c
+  lapacke_sgetsqrhrt.c
+  lapacke_sgetsqrhrt_work.c
  lapacke_sggbak.c
  lapacke_sggbak_work.c
  lapacke_sggbal.c
@ -1435,6 +1445,8 @@ set(SSRC
  lapacke_sorgrq_work.c
  lapacke_sorgtr.c
  lapacke_sorgtr_work.c
+  lapacke_sorgtsqr_row.c
+  lapacke_sorgtsqr_row_work.c
  lapacke_sormbr.c
  lapacke_sormbr_work.c
  lapacke_sormhr.c
@ -1877,6 +1889,8 @@ set(ZSRC
  lapacke_zgetrs_work.c
  lapacke_zgetsls.c
  lapacke_zgetsls_work.c
+  lapacke_zgetsqrhrt.c
+  lapacke_zgetsqrhrt_work.c
  lapacke_zggbak.c
  lapacke_zggbak_work.c
  lapacke_zggbal.c
@ -2351,6 +2365,8 @@ set(ZSRC
  lapacke_zungrq_work.c
  lapacke_zungtr.c
  lapacke_zungtr_work.c
+  lapacke_zungtsqr_row.c
+  lapacke_zungtsqr_row_work.c
  lapacke_zunmbr.c
  lapacke_zunmbr_work.c
  lapacke_zunmhr.c
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@ -299,6 +299,10 @@ if (NO_AVX2)
  set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
 endif ()

+if (NO_AVX512)
+  set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
+endif ()
+
 if (USE_THREAD)
  # USE_SIMPLE_THREADED_LEVEL3 = 1
  # NO_AFFINITY = 1
--- a/common.h
+++ b/common.h
@ -416,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_alpha.h"
 #endif

+#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+#ifndef _CET_ENDBR
+#define _CET_ENDBR
+#endif
+
 #ifdef ARCH_X86
 #include "common_x86.h"
 #endif
--- a/common_x86.h
+++ b/common_x86.h
@ -340,7 +340,8 @@ REALNAME:
 	.align 16; \
 	.globl REALNAME ;\
       .type REALNAME, @function; \
-REALNAME:
+REALNAME: \
+	_CET_ENDBR

 #ifdef PROFILE
 #define PROFCODE call mcount
--- a/common_x86_64.h
+++ b/common_x86_64.h
@ -451,7 +451,8 @@ REALNAME:
 	.align 512; \
 	.globl REALNAME ;\
       .type REALNAME, @function; \
-REALNAME:
+REALNAME: \
+	_CET_ENDBR

 #ifdef PROFILE
 #define PROFCODE call *mcount@GOTPCREL(%rip)
--- a/ctest/c_cblas2.c
+++ b/ctest/c_cblas2.c
@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n,
  get_transpose_type(transp, &trans);
  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A  = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) );
+     A  = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ){
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
  get_transpose_type(transp, &trans);
  if (*order == TEST_ROW_MJR) {
     LDA = *ku+*kl+2;
-     A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX));
+     A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
     for( i=0; i<*ku; i++ ){
        irow=*ku+*kl-i;
        jcol=(*ku)-i;
@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX));
+     A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ){
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
+     A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ){
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
+     A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ ){
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
 		 *incx, beta, y, *incy );
     else {
        LDA = *k+2;
-        A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
+        A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
        if (uplo == CblasUpper) {
           for( i=0; i<*k; i++ ){
              irow=*k-i;
@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
 	         beta, y, *incy);
     else {
        LDA = *n;
-        A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
-        AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
+        A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
+        AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
 	        sizeof( CBLAS_TEST_COMPLEX ));
        if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn,
 	x, *incx);
     else {
        LDA = *k+2;
-        A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
+        A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
        if (uplo == CblasUpper) {
           for( i=0; i<*k; i++ ){
              irow=*k-i;
@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn,
 	         *incx);
     else {
        LDA = *k+2;
-        A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ));
+        A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ));
        if (uplo == CblasUpper) {
           for( i=0; i<*k; i++ ){
              irow=*k-i;
@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn,
        cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
     else {
        LDA = *n;
-        A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
-        AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
+        A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
+        AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
 	 	sizeof(CBLAS_TEST_COMPLEX));
        if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn,
        cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
     else {
        LDA = *n;
-        A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
-        AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
+        A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
+        AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
 		sizeof(CBLAS_TEST_COMPLEX));
     	if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA=*n+1;
-     A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
+     A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
     for( i=0; i<*n; i++ )
       for( j=0; j<*n; j++ ) {
 	  A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
+     A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ ) {
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha,
        cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
     else {
        LDA = *n;
-        A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
-        AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
+        A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
+        AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
 		sizeof( CBLAS_TEST_COMPLEX ));
        if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
 		     *incy, ap );
     else {
        LDA = *n;
-        A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
-        AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)*
+        A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
+        AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
 	sizeof( CBLAS_TEST_COMPLEX ));
        if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX ));
+     A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX ));

     for( i=0; i<*n; i++ )
       for( j=0; j<*n; j++ ) {
@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
+     A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );

     for( i=0; i<*n; i++ )
       for( j=0; j<*n; j++ ) {
--- a/ctest/c_dblas2.c
+++ b/ctest/c_dblas2.c
@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha,
  get_transpose_type(transp, &trans);
  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( double* )malloc( (*m)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( double* )malloc( (*m)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );

     for( i=0; i<*m; i++ ) {
       for( j=0; j<*n; j++ )
@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
     for( i=0; i<*n; i++ )
       for( j=0; j<*n; j++ )
         A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,

  if (*order == TEST_ROW_MJR) {
     LDA = *ku+*kl+2;
-     A   = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) );
     for( i=0; i<*ku; i++ ){
        irow=*ku+*kl-i;
        jcol=(*ku)-i;
@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *k+1;
-     A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
+     A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
     if (uplo == CblasUpper) {
        for( i=0; i<*k; i++ ){
           irow=*k-i;
@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *k+1;
-     A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
+     A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
     if (uplo == CblasUpper) {
        for( i=0; i<*k; i++ ){
        irow=*k-i;
@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *k+1;
-     A   = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
+     A   = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
     if (uplo == CblasUpper) {
        for( i=0; i<*k; i++ ){
           irow=*k-i;
@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( double* )malloc( LDA*LDA*sizeof( double ) );
-     AP  = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
+     A   = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
+     AP  = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
@ -411,8 +411,8 @@ void F77_dtpmv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( double* )malloc( LDA*LDA*sizeof( double ) );
-     AP  = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
+     A   = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
+     AP  = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
@ -451,8 +451,8 @@ void F77_dtpsv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( double* )malloc( LDA*LDA*sizeof( double ) );
-     AP  = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
+     A   = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
+     AP  = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
@ -488,8 +488,8 @@ void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( double* )malloc( LDA*LDA*sizeof( double ) );
-     AP  = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
+     A   = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
+     AP  = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
@ -540,8 +540,8 @@ void F77_dspr2(int *order, char *uplow, int *n, double *alpha, double *x,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( double* )malloc( LDA*LDA*sizeof( double ) );
-     AP  = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
+     A   = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
+     AP  = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
--- a/ctest/c_dblas3.c
+++ b/ctest/c_dblas3.c
@ -26,34 +26,34 @@ void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n,
  if (*order == TEST_ROW_MJR) {
     if (transa == CblasNoTrans) {
        LDA = *k+1;
-        A = (double *)malloc( (*m)*LDA*sizeof( double ) );
+        A = (double *)malloc( (*m)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*k; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else {
        LDA = *m+1;
-        A   = ( double* )malloc( LDA*(*k)*sizeof( double ) );
+        A   = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     if (transb == CblasNoTrans) {
        LDB = *n+1;
-        B   = ( double* )malloc( (*k)*LDB*sizeof( double ) );
+        B   = ( double* )malloc( (*k)*(size_t)LDB*sizeof( double ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ )
              B[i*LDB+j]=b[j*(*ldb)+i];
     }
     else {
        LDB = *k+1;
-        B   = ( double* )malloc( LDB*(*n)*sizeof( double ) );
+        B   = ( double* )malloc( (size_t)LDB*(*n)*sizeof( double ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ )
              B[i*LDB+j]=b[j*(*ldb)+i];
     }
     LDC = *n+1;
-     C   = ( double* )malloc( (*m)*LDC*sizeof( double ) );
+     C   = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
     for( j=0; j<*n; j++ )
        for( i=0; i<*m; i++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -89,25 +89,25 @@ void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A   = ( double* )malloc( (*m)*LDA*sizeof( double ) );
+        A   = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+        A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDB = *n+1;
-     B   = ( double* )malloc( (*m)*LDB*sizeof( double ) );
+     B   = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           B[i*LDB+j]=b[j*(*ldb)+i];
     LDC = *n+1;
-     C   = ( double* )malloc( (*m)*LDC*sizeof( double ) );
+     C   = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
     for( j=0; j<*n; j++ )
        for( i=0; i<*m; i++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -143,20 +143,20 @@ void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k,
  if (*order == TEST_ROW_MJR) {
     if (trans == CblasNoTrans) {
        LDA = *k+1;
-        A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+        A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A   = ( double* )malloc( (*k)*LDA*sizeof( double ) );
+        A   = ( double* )malloc( (*k)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDC = *n+1;
-     C   = ( double* )malloc( (*n)*LDC*sizeof( double ) );
+     C   = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -191,8 +191,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
     if (trans == CblasNoTrans) {
        LDA = *k+1;
        LDB = *k+1;
-        A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
-        B   = ( double* )malloc( (*n)*LDB*sizeof( double ) );
+        A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
+        B   = ( double* )malloc( (*n)*(size_t)LDB*sizeof( double ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ ) {
              A[i*LDA+j]=a[j*(*lda)+i];
@ -202,8 +202,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
     else {
        LDA = *n+1;
        LDB = *n+1;
-        A   = ( double* )malloc( LDA*(*k)*sizeof( double ) );
-        B   = ( double* )malloc( LDB*(*k)*sizeof( double ) );
+        A   = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
+        B   = ( double* )malloc( (size_t)LDB*(*k)*sizeof( double ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ ){
              A[i*LDA+j]=a[j*(*lda)+i];
@ -211,7 +211,7 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
           }
     }
     LDC = *n+1;
-     C   = ( double* )malloc( (*n)*LDC*sizeof( double ) );
+     C   = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -249,20 +249,20 @@ void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A   = ( double* )malloc( (*m)*LDA*sizeof( double ) );
+        A   = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+        A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDB = *n+1;
-     B   = ( double* )malloc( (*m)*LDB*sizeof( double ) );
+     B   = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           B[i*LDB+j]=b[j*(*ldb)+i];
@ -300,20 +300,20 @@ void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A   = ( double* )malloc( (*m)*LDA*sizeof( double ) );
+        A   = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A   = ( double* )malloc( (*n)*LDA*sizeof( double ) );
+        A   = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDB = *n+1;
-     B   = ( double* )malloc( (*m)*LDB*sizeof( double ) );
+     B   = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           B[i*LDB+j]=b[j*(*ldb)+i];
--- a/ctest/c_sblas2.c
+++ b/ctest/c_sblas2.c
@ -19,7 +19,7 @@ void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha,
  get_transpose_type(transp, &trans);
  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( float* )malloc( (*m)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -43,7 +43,7 @@ void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( float* )malloc( (*m)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );

     for( i=0; i<*m; i++ ) {
       for( j=0; j<*n; j++ )
@ -74,7 +74,7 @@ void F77_strmv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
     for( i=0; i<*n; i++ )
       for( j=0; j<*n; j++ )
         A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -102,7 +102,7 @@ void F77_strsv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -123,7 +123,7 @@ void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -146,7 +146,7 @@ void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -170,7 +170,7 @@ void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           A[ LDA*i+j ]=a[ (*lda)*j+i ];
@ -196,7 +196,7 @@ void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,

  if (*order == TEST_ROW_MJR) {
     LDA = *ku+*kl+2;
-     A   = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*n+*kl)*(size_t)LDA*sizeof( float ) );
     for( i=0; i<*ku; i++ ){
        irow=*ku+*kl-i;
        jcol=(*ku)-i;
@ -236,7 +236,7 @@ void F77_stbmv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *k+1;
-     A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
+     A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
     if (uplo == CblasUpper) {
        for( i=0; i<*k; i++ ){
           irow=*k-i;
@ -282,7 +282,7 @@ void F77_stbsv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *k+1;
-     A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
+     A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
     if (uplo == CblasUpper) {
        for( i=0; i<*k; i++ ){
        irow=*k-i;
@ -325,7 +325,7 @@ void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *k+1;
-     A   = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
+     A   = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
     if (uplo == CblasUpper) {
        for( i=0; i<*k; i++ ){
           irow=*k-i;
@ -369,8 +369,8 @@ void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( float* )malloc( LDA*LDA*sizeof( float ) );
-     AP  = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
+     A   = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
+     AP  = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
@ -410,8 +410,8 @@ void F77_stpmv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( float* )malloc( LDA*LDA*sizeof( float ) );
-     AP  = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
+     A   = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
+     AP  = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
@ -449,8 +449,8 @@ void F77_stpsv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( float* )malloc( LDA*LDA*sizeof( float ) );
-     AP  = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
+     A   = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
+     AP  = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
@ -485,8 +485,8 @@ void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( float* )malloc( LDA*LDA*sizeof( float ) );
-     AP  = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
+     A   = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
+     AP  = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
@ -536,8 +536,8 @@ void F77_sspr2(int *order, char *uplow, int *n, float *alpha, float *x,

  if (*order == TEST_ROW_MJR) {
     LDA = *n;
-     A   = ( float* )malloc( LDA*LDA*sizeof( float ) );
-     AP  = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
+     A   = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
+     AP  = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
     if (uplo == CblasUpper) {
        for( j=0, k=0; j<*n; j++ )
           for( i=0; i<j+1; i++, k++ )
--- a/ctest/c_sblas3.c
+++ b/ctest/c_sblas3.c
@ -23,34 +23,34 @@ void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n,
  if (*order == TEST_ROW_MJR) {
     if (transa == CblasNoTrans) {
        LDA = *k+1;
-        A = (float *)malloc( (*m)*LDA*sizeof( float ) );
+        A = (float *)malloc( (*m)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*k; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else {
        LDA = *m+1;
-        A   = ( float* )malloc( LDA*(*k)*sizeof( float ) );
+        A   = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     if (transb == CblasNoTrans) {
        LDB = *n+1;
-        B   = ( float* )malloc( (*k)*LDB*sizeof( float ) );
+        B   = ( float* )malloc( (*k)*(size_t)LDB*sizeof( float ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ )
              B[i*LDB+j]=b[j*(*ldb)+i];
     }
     else {
        LDB = *k+1;
-        B   = ( float* )malloc( LDB*(*n)*sizeof( float ) );
+        B   = ( float* )malloc( (size_t)LDB*(*n)*sizeof( float ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ )
              B[i*LDB+j]=b[j*(*ldb)+i];
     }
     LDC = *n+1;
-     C   = ( float* )malloc( (*m)*LDC*sizeof( float ) );
+     C   = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
     for( j=0; j<*n; j++ )
        for( i=0; i<*m; i++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -85,25 +85,25 @@ void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A   = ( float* )malloc( (*m)*LDA*sizeof( float ) );
+        A   = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+        A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDB = *n+1;
-     B   = ( float* )malloc( (*m)*LDB*sizeof( float ) );
+     B   = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           B[i*LDB+j]=b[j*(*ldb)+i];
     LDC = *n+1;
-     C   = ( float* )malloc( (*m)*LDC*sizeof( float ) );
+     C   = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
     for( j=0; j<*n; j++ )
        for( i=0; i<*m; i++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -139,20 +139,20 @@ void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k,
  if (*order == TEST_ROW_MJR) {
     if (trans == CblasNoTrans) {
        LDA = *k+1;
-        A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+        A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A   = ( float* )malloc( (*k)*LDA*sizeof( float ) );
+        A   = ( float* )malloc( (*k)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDC = *n+1;
-     C   = ( float* )malloc( (*n)*LDC*sizeof( float ) );
+     C   = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -187,8 +187,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
     if (trans == CblasNoTrans) {
        LDA = *k+1;
        LDB = *k+1;
-        A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
-        B   = ( float* )malloc( (*n)*LDB*sizeof( float ) );
+        A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
+        B   = ( float* )malloc( (*n)*(size_t)LDB*sizeof( float ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ ) {
              A[i*LDA+j]=a[j*(*lda)+i];
@ -198,8 +198,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
     else {
        LDA = *n+1;
        LDB = *n+1;
-        A   = ( float* )malloc( LDA*(*k)*sizeof( float ) );
-        B   = ( float* )malloc( LDB*(*k)*sizeof( float ) );
+        A   = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
+        B   = ( float* )malloc( (size_t)LDB*(*k)*sizeof( float ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ ){
              A[i*LDA+j]=a[j*(*lda)+i];
@ -207,7 +207,7 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
           }
     }
     LDC = *n+1;
-     C   = ( float* )malloc( (*n)*LDC*sizeof( float ) );
+     C   = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -245,20 +245,20 @@ void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A   = ( float* )malloc( (*m)*LDA*sizeof( float ) );
+        A   = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+        A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDB = *n+1;
-     B   = ( float* )malloc( (*m)*LDB*sizeof( float ) );
+     B   = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           B[i*LDB+j]=b[j*(*ldb)+i];
@ -296,20 +296,20 @@ void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A   = ( float* )malloc( (*m)*LDA*sizeof( float ) );
+        A   = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A   = ( float* )malloc( (*n)*LDA*sizeof( float ) );
+        A   = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDB = *n+1;
-     B   = ( float* )malloc( (*m)*LDB*sizeof( float ) );
+     B   = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           B[i*LDB+j]=b[j*(*ldb)+i];
--- a/ctest/c_zblas2.c
+++ b/ctest/c_zblas2.c
@ -20,7 +20,7 @@ void F77_zgemv(int *order, char *transp, int *m, int *n,
  get_transpose_type(transp, &trans);
  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A  = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
+     A  = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ){
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -50,7 +50,7 @@ void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
  get_transpose_type(transp, &trans);
  if (*order == TEST_ROW_MJR) {
     LDA = *ku+*kl+2;
-     A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+     A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
     for( i=0; i<*ku; i++ ){
        irow=*ku+*kl-i;
        jcol=(*ku)-i;
@ -94,7 +94,7 @@ void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+     A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ){
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -122,7 +122,7 @@ void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+     A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ){
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -154,7 +154,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+     A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ ){
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
 		 *incx, beta, y, *incy );
     else {
        LDA = *k+2;
-        A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        if (uplo == CblasUpper) {
           for( i=0; i<*k; i++ ){
              irow=*k-i;
@ -251,8 +251,8 @@ void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
 	         beta, y, *incy);
     else {
        LDA = *n;
-        A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
-        AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
+        A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
+        AP = (CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
 	        sizeof( CBLAS_TEST_ZOMPLEX ));
        if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -311,7 +311,7 @@ void F77_ztbmv(int *order, char *uplow, char *transp, char *diagn,
 	x, *incx);
     else {
        LDA = *k+2;
-        A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        if (uplo == CblasUpper) {
           for( i=0; i<*k; i++ ){
              irow=*k-i;
@ -375,7 +375,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn,
 	         *incx);
     else {
        LDA = *k+2;
-        A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
+        A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
        if (uplo == CblasUpper) {
           for( i=0; i<*k; i++ ){
              irow=*k-i;
@ -436,8 +436,8 @@ void F77_ztpmv(int *order, char *uplow, char *transp, char *diagn,
        cblas_ztpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
     else {
        LDA = *n;
-        A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
-        AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
+        A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
 	 	sizeof(CBLAS_TEST_ZOMPLEX));
        if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -491,8 +491,8 @@ void F77_ztpsv(int *order, char *uplow, char *transp, char *diagn,
        cblas_ztpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
     else {
        LDA = *n;
-        A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
-        AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
+        A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
 		sizeof(CBLAS_TEST_ZOMPLEX));
     	if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -544,7 +544,7 @@ void F77_ztrmv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA=*n+1;
-     A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+     A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
     for( i=0; i<*n; i++ )
       for( j=0; j<*n; j++ ) {
 	  A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -573,7 +573,7 @@ void F77_ztrsv(int *order, char *uplow, char *transp, char *diagn,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+     A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ ) {
           A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@ -601,8 +601,8 @@ void F77_zhpr(int *order, char *uplow, int *n, double *alpha,
        cblas_zhpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
     else {
        LDA = *n;
-        A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
-        AP = ( CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
+        A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        AP = ( CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
 		sizeof( CBLAS_TEST_ZOMPLEX ));
        if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -678,8 +678,8 @@ void F77_zhpr2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
 		     *incy, ap );
     else {
        LDA = *n;
-        A=(CBLAS_TEST_ZOMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
-        AP=(CBLAS_TEST_ZOMPLEX*)malloc( (((LDA+1)*LDA)/2)*
+        A=(CBLAS_TEST_ZOMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        AP=(CBLAS_TEST_ZOMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
 	sizeof( CBLAS_TEST_ZOMPLEX ));
        if (uplo == CblasUpper) {
           for( j=0, k=0; j<*n; j++ )
@ -750,7 +750,7 @@ void F77_zher(int *order, char *uplow, int *n, double *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
+     A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX ));

     for( i=0; i<*n; i++ )
       for( j=0; j<*n; j++ ) {
@ -784,7 +784,7 @@ void F77_zher2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,

  if (*order == TEST_ROW_MJR) {
     LDA = *n+1;
-     A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+     A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );

     for( i=0; i<*n; i++ )
       for( j=0; j<*n; j++ ) {
--- a/ctest/c_zblas3.c
+++ b/ctest/c_zblas3.c
@ -26,7 +26,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
  if (*order == TEST_ROW_MJR) {
     if (transa == CblasNoTrans) {
        LDA = *k+1;
-        A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*m; i++ )
           for( j=0; j<*k; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -35,7 +35,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
     }
     else {
        LDA = *m+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*k; i++ )
           for( j=0; j<*m; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -45,7 +45,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,

     if (transb == CblasNoTrans) {
        LDB = *n+1;
-        B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
+        B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ ) {
              B[i*LDB+j].real=b[j*(*ldb)+i].real;
@ -54,7 +54,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
     }
     else {
        LDB = *k+1;
-        B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
+        B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ ) {
              B[i*LDB+j].real=b[j*(*ldb)+i].real;
@ -63,7 +63,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
     }

     LDC = *n+1;
-     C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
+     C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
     for( j=0; j<*n; j++ )
        for( i=0; i<*m; i++ ) {
           C[i*LDC+j].real=c[j*(*ldc)+i].real;
@ -103,7 +103,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -112,7 +112,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
     }
     else{
        LDA = *n+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -120,14 +120,14 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
           }
     }
     LDB = *n+1;
-     B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
+     B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ) {
           B[i*LDB+j].real=b[j*(*ldb)+i].real;
           B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
        }
     LDC = *n+1;
-     C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
+     C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
     for( j=0; j<*n; j++ )
        for( i=0; i<*m; i++ ) {
           C[i*LDC+j].real=c[j*(*ldc)+i].real;
@ -167,25 +167,25 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     else{
        LDA = *n+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ )
              A[i*LDA+j]=a[j*(*lda)+i];
     }
     LDB = *n+1;
-     B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
+     B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ )
           B[i*LDB+j]=b[j*(*ldb)+i];
     LDC = *n+1;
-     C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
+     C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
     for( j=0; j<*n; j++ )
        for( i=0; i<*m; i++ )
           C[i*LDC+j]=c[j*(*ldc)+i];
@ -221,7 +221,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
  if (*order == TEST_ROW_MJR) {
     if (trans == CblasNoTrans) {
        LDA = *k+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -230,7 +230,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
     }
     else{
        LDA = *n+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -238,7 +238,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
           }
     }
     LDC = *n+1;
-     C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
+     C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ ) {
           C[i*LDC+j].real=c[j*(*ldc)+i].real;
@ -277,7 +277,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
  if (*order == TEST_ROW_MJR) {
     if (trans == CblasNoTrans) {
        LDA = *k+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -286,7 +286,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
     }
     else{
        LDA = *n+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -294,7 +294,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
           }
     }
     LDC = *n+1;
-     C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
+     C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ ) {
           C[i*LDC+j].real=c[j*(*ldc)+i].real;
@ -333,8 +333,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
     if (trans == CblasNoTrans) {
        LDA = *k+1;
        LDB = *k+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
-        B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
+        B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -346,8 +346,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
     else {
        LDA = *n+1;
        LDB = *n+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
-        B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        A=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        B=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ ){
 	      A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -357,7 +357,7 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
           }
     }
     LDC = *n+1;
-     C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
+     C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ ) {
           C[i*LDC+j].real=c[j*(*ldc)+i].real;
@ -397,8 +397,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
     if (trans == CblasNoTrans) {
        LDA = *k+1;
        LDB = *k+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
-        B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*n; i++ )
           for( j=0; j<*k; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -410,8 +410,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
     else {
        LDA = *n+1;
        LDB = *n+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
-        B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
+        B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*k; i++ )
           for( j=0; j<*n; j++ ){
 	      A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -421,7 +421,7 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
           }
     }
     LDC = *n+1;
-     C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
+     C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
     for( i=0; i<*n; i++ )
        for( j=0; j<*n; j++ ) {
           C[i*LDC+j].real=c[j*(*ldc)+i].real;
@ -463,7 +463,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -472,7 +472,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
     }
     else{
        LDA = *n+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -480,7 +480,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
           }
     }
     LDB = *n+1;
-     B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
+     B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ) {
           B[i*LDB+j].real=b[j*(*ldb)+i].real;
@ -522,7 +522,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
  if (*order == TEST_ROW_MJR) {
     if (side == CblasLeft) {
        LDA = *m+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
+        A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
        for( i=0; i<*m; i++ )
           for( j=0; j<*m; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -531,7 +531,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
     }
     else{
        LDA = *n+1;
-        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+        A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
        for( i=0; i<*n; i++ )
           for( j=0; j<*n; j++ ) {
              A[i*LDA+j].real=a[j*(*lda)+i].real;
@ -539,7 +539,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
           }
     }
     LDB = *n+1;
-     B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
+     B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
     for( i=0; i<*m; i++ )
        for( j=0; j<*n; j++ ) {
           B[i*LDB+j].real=b[j*(*ldb)+i].real;
--- a/ctest/constant.c
+++ b/ctest/constant.c
@ -1,3 +1,4 @@
+#include "cblas_test.h"
 int CBLAS_CallFromC;
 int RowMajorStrg;

--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@ -126,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg);
 #endif

 #define get_cpu_ftr(id, var) ({					\
-		__asm__ ("mrs %0, "#id : "=r" (var));		\
+		__asm__ __volatile__ ("mrs %0, "#id : "=r" (var));		\
 	})

 static char *corename[] = {
--- a/exports/Makefile
+++ b/exports/Makefile
@ -139,9 +139,13 @@ endif
 ifneq (,$(filter 1 2,$(NOFORTRAN)))
 #only build without Fortran
 	$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def  $(FEXTRALIB)
+else
+ifeq ($(F_COMPILER), INTEL)
+	$(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def
 else
 	$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def  $(FEXTRALIB)
 endif
+endif

 dllinit.$(SUFFIX) : dllinit.c
 	$(CC) $(CFLAGS) -c -o $(@F) -s $<
--- a/4
+++ b/4
@ -391,10 +391,6 @@ if ($link ne "") {

 }

-if ($vendor eq "INTEL"){
-    $linker_a .= "-lgfortran"
-}
-
 if ($vendor eq "FLANG"){
    $linker_a .= "-lflang"
 }
--- a/getarch_2nd.c
+++ b/getarch_2nd.c
@ -4,7 +4,15 @@
 #else
 #include "config_kernel.h"
 #endif
-#include "common.h"
+#if (defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64)) && defined(__64BIT__)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#include "param.h"

 int main(int argc, char **argv) {

--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@ -150,9 +150,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 #endif

 	if ( *lda >  *ldb )
-		msize = (*lda) * (*ldb)  * sizeof(FLOAT);
+		msize = (size_t)(*lda) * (*ldb)  * sizeof(FLOAT);
 	else
-		msize = (*ldb) * (*ldb)  * sizeof(FLOAT);
+		msize = (size_t)(*ldb) * (*ldb)  * sizeof(FLOAT);

 	b = malloc(msize);
 	if ( b == NULL )
--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 #endif

 	if ( *lda >  *ldb )
-                msize = (*lda) * (*ldb)  * sizeof(FLOAT) * 2;
+                msize = (size_t)(*lda) * (*ldb)  * sizeof(FLOAT) * 2;
        else
-                msize = (*ldb) * (*ldb)  * sizeof(FLOAT) * 2;
+                msize = (size_t)(*ldb) * (*ldb)  * sizeof(FLOAT) * 2;

        b = malloc(msize);
        if ( b == NULL )
--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@ -79,8 +79,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
      aa_i = fabs(da_r);
    }

-    scale = (aa_i / aa_r);
-    ada = aa_r * sqrt(ONE + scale * scale);
+    if (aa_r == ZERO) {
+	ada = 0.;
+    } else {
+        scale = (aa_i / aa_r);
+        ada = aa_r * sqrt(ONE + scale * scale);
+    }

    bb_r = fabs(db_r);
    bb_i = fabs(db_i);
@ -90,9 +94,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
      bb_i = fabs(bb_r);
    }

-    scale = (bb_i / bb_r);
-    adb = bb_r * sqrt(ONE + scale * scale);
-
+    if (bb_r == ZERO) {
+	adb = 0.;
+    } else {
+    	scale = (bb_i / bb_r);
+    	adb = bb_r * sqrt(ONE + scale * scale);
+    }
    scale = ada + adb;

    aa_r    = da_r / scale;
--- a/kernel/arm64/sgemm_tcopy_16.S
+++ b/kernel/arm64/sgemm_tcopy_16.S
@ -270,11 +270,6 @@ All rights reserved.
 	ldr	s1, [A02]
 	ldr	s2, [A03]
 	ldr	s3, [A04]
-	
-	add	A01, A01, #4
-	add	A02, A02, #4
-	add	A03, A03, #4
-	add	A04, A04, #4

 	stp	s0, s1, [B04]
 	add	B04, B04, #8
@ -285,11 +280,6 @@ All rights reserved.
 	ldr	s5, [A06]
 	ldr	s6, [A07]
 	ldr	s7, [A08]
-	
-	ldr	d4, [A05], #8
-	ldr	d5, [A06], #8
-	ldr	d6, [A07], #8
-	ldr	d7, [A08], #8

 	stp	s4, s5, [B04]
 	add	B04, B04, #8
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@ -169,8 +169,13 @@ ZROTKERNEL   = zrot.c
 #
 SSCALKERNEL  = sscal.c
 DSCALKERNEL  = dscal.c
+ifeq ($(C_COMPILER), PGI)
+CSCALKERNEL  = ../arm/zscal.c
+ZSCALKERNEL  = ../arm/zscal.c
+else
 CSCALKERNEL  = zscal.c
 ZSCALKERNEL  = zscal.c
+endif
 #
 SSWAPKERNEL  = sswap.c
 DSWAPKERNEL  = dswap.c
@ -181,7 +186,7 @@ ZSWAPKERNEL  = zswap.c
 SGEMVNKERNEL = sgemv_n.c
 DGEMVNKERNEL = dgemv_n_power10.c
 CGEMVNKERNEL = cgemv_n.c
-ZGEMVNKERNEL = zgemv_n_4.c
+ZGEMVNKERNEL =  zgemv_n_power10.c
 #
 SGEMVTKERNEL = sgemv_t.c
 DGEMVTKERNEL = dgemv_t_power10.c
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@ -242,8 +242,13 @@ ZROTKERNEL   = zrot.c
 #
 SSCALKERNEL  = sscal.c
 DSCALKERNEL  = dscal.c
+ifeq ($(C_COMPILER), PGI)
+CSCALKERNEL  = ../arm/zscal.c
+ZSCALKERNEL  = ../arm/zscal.c
+else
 CSCALKERNEL  = zscal.c
 ZSCALKERNEL  = zscal.c
+endif
 #
 SSWAPKERNEL  = sswap.c
 DSWAPKERNEL  = dswap.c
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@ -166,8 +166,13 @@ ZROTKERNEL   = zrot.c
 #
 SSCALKERNEL  = sscal.c
 DSCALKERNEL  = dscal.c
+ifeq ($(C_COMPILER), PGI)
+CSCALKERNEL  = ../arm/zscal.c
+ZSCALKERNEL  = ../arm/zscal.c
+else
 CSCALKERNEL  = zscal.c
 ZSCALKERNEL  = zscal.c
+endif
 #
 SSWAPKERNEL  = sswap.c
 DSWAPKERNEL  = dswap.c
--- a/kernel/power/cdot.c
+++ b/kernel/power/cdot.c
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else

 #include "common.h"
-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "cdot_microk_power10.c"
 #else
 #ifndef HAVE_KERNEL_8
@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA

    if ((inc_x == 1) && (inc_y == 1)) {

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
        BLASLONG n1 = n & -16;
 #else
        BLASLONG n1 = n & -8;
--- a/kernel/power/cswap.c
+++ b/kernel/power/cswap.c
@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8)  || defined(POWER9)
 #include "cswap_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "cswap_microk_power10.c"
+#elif defined(POWER10)
+#include "cswap_microk_power8.c"
 #endif
 #endif

--- a/kernel/power/dasum.c
+++ b/kernel/power/dasum.c
@ -49,8 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "dasum_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "dasum_microk_power10.c"
+#elif defined(POWER10)
+#include "dasum_microk_power8.c"
 #endif
 #endif

@ -112,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if ( inc_x == 1 )
 	{

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 		if ( n >= 16 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
--- a/kernel/power/dgemm_kernel_power10.c
+++ b/kernel/power/dgemm_kernel_power10.c
@ -190,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
 	  BLASLONG l = 0;
 	  vec_t *rowA = (vec_t *) & AO[0];
-	  vec_t *rb = (vec_t *) & BO[0];
 	  __vector_pair rowB, rowB1;
-	  __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
-	  __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
+	  rowB = *((__vector_pair *)((void *)&BO[0]));
+	  rowB1 = *((__vector_pair *)((void *)&BO[4]));
 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
 	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
 	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@ -205,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  for (l = 1; l < temp; l++)
 	    {
 	      rowA = (vec_t *) & AO[l << 3];
-	      rb = (vec_t *) & BO[l << 3];
-	      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
-	      __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
+	      rowB = *((__vector_pair *)((void *)&BO[l << 3]));
+	      rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
 	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@ -247,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  BLASLONG l = 0;
 	  vec_t *rowA = (vec_t *) & AO[0];
 	  __vector_pair rowB, rowB1;
-	  vec_t *rb = (vec_t *) & BO[0];
-	  __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
-	  __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
+	  rowB = *((__vector_pair *)((void *)&BO[0]));
+	  rowB1 = *((__vector_pair *)((void *)&BO[4]));
 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
 	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
 	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@ -257,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  for (l = 1; l < temp; l++)
 	    {
 	      rowA = (vec_t *) & AO[l << 2];
-	      rb = (vec_t *) & BO[l << 3];
-	      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
-	      __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
+	      rowB = *((__vector_pair *)((void *)&BO[l << 3]));
+	      rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
 	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@ -291,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  BLASLONG l = 0;
 	  vec_t *rowA = (vec_t *) & AO[0];
 	  __vector_pair rowB, rowB1;
-	  vec_t *rb = (vec_t *) & BO[0];
-	  __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
-	  __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
+	  rowB = *((__vector_pair *)((void *)&BO[0]));
+	  rowB1 = *((__vector_pair *)((void *)&BO[4]));
 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
 	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
 	  for (l = 1; l < temp; l++)
 	    {
 	      rowA = (vec_t *) & AO[l << 1];
-	      rb = (vec_t *) & BO[l << 3];
-	      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
-	      __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
+	      rowB = *((__vector_pair *)((void *)&BO[l << 3]));
+	      rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
 	    }
@ -403,8 +397,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  BLASLONG l = 0;
 	  vec_t *rowA = (vec_t *) & AO[0];
 	  __vector_pair rowB;
-	  vec_t *rb = (vec_t *) & BO[0];
-	  __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+	  rowB = *((__vector_pair *)((void *)&BO[0]));
 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
 	  __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
 	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
@ -412,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  for (l = 1; l < temp; l++)
 	    {
 	      rowA = (vec_t *) & AO[l << 3];
-	      rb = (vec_t *) & BO[l << 2];
-	      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+	      rowB = *((__vector_pair *)((void *)&BO[l << 2]));
 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
 	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@ -445,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  BLASLONG l = 0;
 	  vec_t *rowA = (vec_t *) & AO[0];
 	  __vector_pair rowB;
-	  vec_t *rb = (vec_t *) & BO[0];
-	  __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+	  rowB = *((__vector_pair *)((void *)&BO[0]));
 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
 	  __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
 	  for (l = 1; l < temp; l++)
 	    {
 	      rowA = (vec_t *) & AO[l << 2];
-	      rb = (vec_t *) & BO[l << 2];
-	      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+	      rowB = *((__vector_pair *)((void *)&BO[l << 2]));
 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
 	    }
@ -481,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  BLASLONG l = 0;
 	  vec_t *rowA = (vec_t *) & AO[0];
 	  __vector_pair rowB;
-	  vec_t *rb = (vec_t *) & BO[0];
-	  __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+	  rowB = *((__vector_pair *)((void *)&BO[0]));
 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
 	  for (l = 1; l < temp; l++)
 	    {
 	      rowA = (vec_t *) & AO[l << 1];
-	      rb = (vec_t *) & BO[l << 2];
-	      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+	      rowB = *((__vector_pair *)((void *)&BO[l << 2]));
 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 	    }
 	  SAVE_ACC (&acc0, 0);
--- a/kernel/power/drot.c
+++ b/kernel/power/drot.c
@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "drot_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "drot_microk_power10.c"
+#elif defined(POWER10)
+#include "drot_microk_power8.c"
 #endif
 #endif

@ -117,7 +119,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 	if ( (inc_x == 1) && (inc_y == 1) )
 	{

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 		if ( n >= 16 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
--- a/kernel/power/dscal.c
+++ b/kernel/power/dscal.c
@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "dscal_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "dscal_microk_power10.c"
+#elif defined(POWER10)
+#include "dscal_microk_power8.c"
 #endif
 #endif

@ -102,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		if ( da == 0.0 )
 		{		

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 			if ( n >= 16 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
@ -136,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		else
 		{

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 			if ( n >= 16 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
--- a/kernel/power/dswap.c
+++ b/kernel/power/dswap.c
@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "dswap_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "swap_microk_power10.c"
+#elif defined(POWER10)
+#include "dswap_microk_power8.c"
 #endif
 #endif

@ -117,7 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
 	if ( (inc_x == 1) && (inc_y == 1 ))
 	{

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 		if ( n >= 32 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
--- a/kernel/power/sasum.c
+++ b/kernel/power/sasum.c
@ -49,8 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "sasum_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "sasum_microk_power10.c"
+#elif defined(POWER10)
+#include "sasum_microk_power8.c"
 #endif
 #endif

@ -112,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if ( inc_x == 1 )
 	{

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 		if ( n >= 32 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
--- a/kernel/power/srot.c
+++ b/kernel/power/srot.c
@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "srot_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "srot_microk_power10.c"
+#elif defined(POWER10)
+#include "srot_microk_power8.c"
 #endif
 #endif

@ -117,7 +119,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 	if ( (inc_x == 1) && (inc_y == 1) )
 	{

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 		if ( n >= 16 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
--- a/kernel/power/sscal.c
+++ b/kernel/power/sscal.c
@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "sscal_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "sscal_microk_power10.c"
+#elif defined(POWER10)
+#include "sscal_microk_power8.c"
 #endif
 #endif

@ -104,7 +106,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		if ( da == 0.0 )
 		{		

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 			if ( n >= 32 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
@ -138,7 +140,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		else
 		{

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 			if ( n >= 32 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
--- a/kernel/power/sswap.c
+++ b/kernel/power/sswap.c
@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "sswap_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "swap_microk_power10.c"
+#elif defined(POWER10)
+#include "sswap_microk_power8.c"
 #endif
 #endif

@ -117,7 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
 	if ( (inc_x == 1) && (inc_y == 1 ))
 	{

-#if defined(POWER10)
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 		if ( n >= 64 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
--- a/kernel/power/zgemv_n_power10.c
+++ b/kernel/power/zgemv_n_power10.c
--- a/kernel/power/zgemv_t_4.c
+++ b/kernel/power/zgemv_t_4.c
@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #elif HAVE_KERNEL_4x4_VEC

+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+typedef __vector unsigned char  vec_t;
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+
+
+static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0, *a1, *a2, *a3;
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    __vector_quad acc0, acc1, acc2, acc3;;
+    __vector_quad acc4, acc5, acc6, acc7;
+    v4sf_t result[4];
+    __vector_pair *Va0, *Va1, *Va2, *Va3;
+    i = 0;
+    n = n << 1;
+    __builtin_mma_xxsetaccz (&acc0);
+    __builtin_mma_xxsetaccz (&acc1);
+    __builtin_mma_xxsetaccz (&acc2);
+    __builtin_mma_xxsetaccz (&acc3);
+    __builtin_mma_xxsetaccz (&acc4);
+    __builtin_mma_xxsetaccz (&acc5);
+    __builtin_mma_xxsetaccz (&acc6);
+    __builtin_mma_xxsetaccz (&acc7);
+    while (i < n) {
+
+	vec_t *rx = (vec_t *) & x[i];
+        Va0  = ((__vector_pair*)((void*)&a0[i]));
+        Va1  = ((__vector_pair*)((void*)&a1[i]));
+        Va2  = ((__vector_pair*)((void*)&a2[i]));
+        Va3  = ((__vector_pair*)((void*)&a3[i]));
+
+        __builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]);
+        i += 8;
+
+    }
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    __builtin_mma_disassemble_acc ((void *)result, &acc0);
+    register FLOAT temp_r0 = result[0][0] - result[1][1];
+    register FLOAT temp_i0 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc4);
+    temp_r0 += result[2][0] - result[3][1];
+    temp_i0 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc1);
+    register FLOAT temp_r1 = result[0][0] - result[1][1];
+    register FLOAT temp_i1 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc5);
+    temp_r1 += result[2][0] - result[3][1];
+    temp_i1 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc2);
+    register FLOAT temp_r2 = result[0][0] - result[1][1];
+    register FLOAT temp_i2 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc6);
+    temp_r2 += result[2][0] - result[3][1];
+    temp_i2 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc3);
+    register FLOAT temp_r3 = result[0][0] - result[1][1];
+    register FLOAT temp_i3 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc7);
+    temp_r3 += result[2][0] - result[3][1];
+    temp_i3 += result[2][1] + result[3][0];
+#else
+    __builtin_mma_disassemble_acc ((void *)result, &acc0);
+    register FLOAT temp_r0 = result[0][0] + result[1][1];
+    register FLOAT temp_i0 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc4);
+    temp_r0 += result[2][0] + result[3][1];
+    temp_i0 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc1);
+    register FLOAT temp_r1 = result[0][0] + result[1][1];
+    register FLOAT temp_i1 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc5);
+    temp_r1 += result[2][0] + result[3][1];
+    temp_i1 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc2);
+    register FLOAT temp_r2 = result[0][0] + result[1][1];
+    register FLOAT temp_i2 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc6);
+    temp_r2 += result[2][0] + result[3][1];
+    temp_i2 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc3);
+    register FLOAT temp_r3 = result[0][0] + result[1][1];
+    register FLOAT temp_i3 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc7);
+    temp_r3 += result[2][0] + result[3][1];
+    temp_i3 += result[2][1] - result[3][0];
+#endif
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
+    y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
+    y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
+    y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
+    y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
+#endif
+}
+#else
 static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG i;
    FLOAT *a0, *a1, *a2, *a3;
@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
 #endif
 }

+#endif
 #else

 static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@ -43,12 +43,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(DOUBLE)
 #include "zscal_microk_power8.c"
 #endif
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #if defined(DOUBLE)
 #include "zscal_microk_power10.c"
 #else
 #include "cscal_microk_power10.c"
 #endif
+#elif defined(POWER10)
+#if defined(DOUBLE)
+#include "zscal_microk_power8.c"
+#endif
 #endif
 #endif

--- a/kernel/power/zswap.c
+++ b/kernel/power/zswap.c
@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "zswap_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "cswap_microk_power10.c"
+#elif defined(POWER10)
+#include "zswap_microk_power8.c"
 #endif
 #endif

--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@ -491,4 +491,3 @@ SSUMKERNEL = ../arm/sum.c
 DSUMKERNEL = ../arm/sum.c

 SOMATCOPY_RT = omatcopy_rt.c
-DOMATCOPY_RT = omatcopy_rt.c
--- a/kernel/x86_64/drot_microk_haswell-2.c
+++ b/kernel/x86_64/drot_microk_haswell-2.c
@ -1,6 +1,4 @@
-/* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
-
+#if defined(HAVE_FMA3)  && defined(HAVE_AVX2)
 #define HAVE_DROT_KERNEL 1

 #include <immintrin.h>
--- a/kernel/x86_64/sgemm_direct_skylakex.c
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@ -1,10 +1,10 @@
 /* the direct sgemm code written by Arjan van der Ven */
-
+#include "common.h"

 #if defined(SKYLAKEX) || defined (COOPERLAKE)

 #include <immintrin.h>
-#include "common.h"
+

 /*
 * "Direct sgemm" code. This code operates directly on the inputs and outputs
@ -472,7 +472,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG s
 	}
 }
 #else
-#include "common.h"
+
 void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
 {}
 #endif
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c
@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
    int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3};
    BLASLONG n_count = n;
    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
+#if defined(__clang__)
+    for(;n_count>23;n_count-=24) COMPUTE(24)
+#else
    for(;n_count>23;n_count-=24) COMPUTE_n24
+#endif    
    for(;n_count>19;n_count-=20) COMPUTE(20)
    for(;n_count>15;n_count-=16) COMPUTE(16)
    for(;n_count>11;n_count-=12) COMPUTE(12)
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@ -566,8 +566,8 @@ void LAPACK_cgbrfsx(
    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
    lapack_complex_float const* AB, lapack_int const* ldab,
    lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    float* R,
-    float* C,
+    const float* R,
+    const float* C,
    lapack_complex_float const* B, lapack_int const* ldb,
    lapack_complex_float* X, lapack_int const* ldx,
    float* rcond,
@ -585,8 +585,8 @@ void LAPACK_dgbrfsx(
    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
    double const* AB, lapack_int const* ldab,
    double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    double* R,
-    double* C,
+    const double* R,
+    const double* C,
    double const* B, lapack_int const* ldb,
    double* X, lapack_int const* ldx,
    double* rcond,
@ -604,8 +604,8 @@ void LAPACK_sgbrfsx(
    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
    float const* AB, lapack_int const* ldab,
    float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    float* R,
-    float* C,
+    const float* R,
+    const float* C,
    float const* B, lapack_int const* ldb,
    float* X, lapack_int const* ldx,
    float* rcond,
@ -623,8 +623,8 @@ void LAPACK_zgbrfsx(
    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
    lapack_complex_double const* AB, lapack_int const* ldab,
    lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    double* R,
-    double* C,
+    const double* R,
+    const double* C,
    lapack_complex_double const* B, lapack_int const* ldb,
    lapack_complex_double* X, lapack_int const* ldx,
    double* rcond,
@ -2941,6 +2941,42 @@ void LAPACK_zgetsls(
    lapack_complex_double* work, lapack_int const* lwork,
    lapack_int* info );

+#define LAPACK_cgetsqrhrt LAPACK_GLOBAL(cgetsqrhrt,CGETSQRHRT)
+void LAPACK_cgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgetsqrhrt LAPACK_GLOBAL(dgetsqrhrt,DGETSQRHRT)
+void LAPACK_dgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* ldt,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgetsqrhrt LAPACK_GLOBAL(sgetsqrhrt,SGETSQRHRT)
+void LAPACK_sgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* ldt,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgetsqrhrt LAPACK_GLOBAL(zgetsqrhrt,ZGETSQRHRT)
+void LAPACK_zgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
 void LAPACK_cggbak(
    char const* job, char const* side,
@ -4768,7 +4804,7 @@ void LAPACK_chegst(
    lapack_int const* itype, char const* uplo,
    lapack_int const* n,
    lapack_complex_float* A, lapack_int const* lda,
-    lapack_complex_float* B, lapack_int const* ldb,
+    const lapack_complex_float* B, lapack_int const* ldb,
    lapack_int* info );

 #define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
@ -4776,7 +4812,7 @@ void LAPACK_zhegst(
    lapack_int const* itype, char const* uplo,
    lapack_int const* n,
    lapack_complex_double* A, lapack_int const* lda,
-    lapack_complex_double* B, lapack_int const* ldb,
+    const lapack_complex_double* B, lapack_int const* ldb,
    lapack_int* info );

 #define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
@ -4913,7 +4949,7 @@ void LAPACK_cherfsx(
    lapack_int const* n, lapack_int const* nrhs,
    lapack_complex_float const* A, lapack_int const* lda,
    lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
    lapack_complex_float const* B, lapack_int const* ldb,
    lapack_complex_float* X, lapack_int const* ldx,
    float* rcond,
@ -4931,7 +4967,7 @@ void LAPACK_zherfsx(
    lapack_int const* n, lapack_int const* nrhs,
    lapack_complex_double const* A, lapack_int const* lda,
    lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
    lapack_complex_double const* B, lapack_int const* ldb,
    lapack_complex_double* X, lapack_int const* ldx,
    double* rcond,
@ -7251,6 +7287,24 @@ void LAPACK_sorgtr(
    float* work, lapack_int const* lwork,
    lapack_int* info );

+#define LAPACK_dorgtsqr_row LAPACK_GLOBAL(dorgtsqr_row,DORGTSQR_ROW)
+void LAPACK_dorgtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    double* A, lapack_int const* lda,
+    double const* T, lapack_int const* ldt,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorgtsqr_row LAPACK_GLOBAL(sorgtsqr_row,SORGTSQR_ROW)
+void LAPACK_sorgtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    float* A, lapack_int const* lda,
+    float const* T, lapack_int const* ldt,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
 void LAPACK_dormbr(
    char const* vect, char const* side, char const* trans,
@ -8005,7 +8059,7 @@ void LAPACK_cporfsx(
    lapack_int const* n, lapack_int const* nrhs,
    lapack_complex_float const* A, lapack_int const* lda,
    lapack_complex_float const* AF, lapack_int const* ldaf,
-    float* S,
+    const float* S,
    lapack_complex_float const* B, lapack_int const* ldb,
    lapack_complex_float* X, lapack_int const* ldx,
    float* rcond,
@ -8023,7 +8077,7 @@ void LAPACK_dporfsx(
    lapack_int const* n, lapack_int const* nrhs,
    double const* A, lapack_int const* lda,
    double const* AF, lapack_int const* ldaf,
-    double* S,
+    const double* S,
    double const* B, lapack_int const* ldb,
    double* X, lapack_int const* ldx,
    double* rcond,
@ -8041,7 +8095,7 @@ void LAPACK_sporfsx(
    lapack_int const* n, lapack_int const* nrhs,
    float const* A, lapack_int const* lda,
    float const* AF, lapack_int const* ldaf,
-    float* S,
+    const float* S,
    float const* B, lapack_int const* ldb,
    float* X, lapack_int const* ldx,
    float* rcond,
@ -8059,7 +8113,7 @@ void LAPACK_zporfsx(
    lapack_int const* n, lapack_int const* nrhs,
    lapack_complex_double const* A, lapack_int const* lda,
    lapack_complex_double const* AF, lapack_int const* ldaf,
-    double* S,
+    const double* S,
    lapack_complex_double const* B, lapack_int const* ldb,
    lapack_complex_double* X, lapack_int const* ldx,
    double* rcond,
@ -10756,7 +10810,7 @@ void LAPACK_csyrfsx(
    lapack_int const* n, lapack_int const* nrhs,
    lapack_complex_float const* A, lapack_int const* lda,
    lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
    lapack_complex_float const* B, lapack_int const* ldb,
    lapack_complex_float* X, lapack_int const* ldx,
    float* rcond,
@ -10774,7 +10828,7 @@ void LAPACK_dsyrfsx(
    lapack_int const* n, lapack_int const* nrhs,
    double const* A, lapack_int const* lda,
    double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
    double const* B, lapack_int const* ldb,
    double* X, lapack_int const* ldx,
    double* rcond,
@ -10792,7 +10846,7 @@ void LAPACK_ssyrfsx(
    lapack_int const* n, lapack_int const* nrhs,
    float const* A, lapack_int const* lda,
    float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
    float const* B, lapack_int const* ldb,
    float* X, lapack_int const* ldx,
    float* rcond,
@ -10810,7 +10864,7 @@ void LAPACK_zsyrfsx(
    lapack_int const* n, lapack_int const* nrhs,
    lapack_complex_double const* A, lapack_int const* lda,
    lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
    lapack_complex_double const* B, lapack_int const* ldb,
    lapack_complex_double* X, lapack_int const* ldx,
    double* rcond,
@ -11556,7 +11610,7 @@ void LAPACK_zsytrs(
 void LAPACK_csytrs2(
    char const* uplo,
    lapack_int const* n, lapack_int const* nrhs,
-    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    const lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
    lapack_complex_float* B, lapack_int const* ldb,
    lapack_complex_float* work,
    lapack_int* info );
@ -11565,7 +11619,7 @@ void LAPACK_csytrs2(
 void LAPACK_dsytrs2(
    char const* uplo,
    lapack_int const* n, lapack_int const* nrhs,
-    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    const double* A, lapack_int const* lda, lapack_int const* ipiv,
    double* B, lapack_int const* ldb,
    double* work,
    lapack_int* info );
@ -11574,7 +11628,7 @@ void LAPACK_dsytrs2(
 void LAPACK_ssytrs2(
    char const* uplo,
    lapack_int const* n, lapack_int const* nrhs,
-    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    const float* A, lapack_int const* lda, lapack_int const* ipiv,
    float* B, lapack_int const* ldb,
    float* work,
    lapack_int* info );
@ -11583,7 +11637,7 @@ void LAPACK_ssytrs2(
 void LAPACK_zsytrs2(
    char const* uplo,
    lapack_int const* n, lapack_int const* nrhs,
-    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    const lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
    lapack_complex_double* B, lapack_int const* ldb,
    lapack_complex_double* work,
    lapack_int* info );
@ -13540,6 +13594,24 @@ void LAPACK_zungtr(
    lapack_complex_double* work, lapack_int const* lwork,
    lapack_int* info );

+#define LAPACK_cungtsqr_row LAPACK_GLOBAL(cungtsqr_row,CUNGTSQR_ROW)
+void LAPACK_cungtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zungtsqr_row LAPACK_GLOBAL(zungtsqr_row,ZUNGTSQR_ROW)
+void LAPACK_zungtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
 void LAPACK_cunmbr(
    char const* vect, char const* side, char const* trans,
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@ -1867,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo,

 lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo,
                           lapack_int n, lapack_complex_float* a,
-                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int lda, const lapack_complex_float* b,
                           lapack_int ldb );
 lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo,
                           lapack_int n, lapack_complex_double* a,
-                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int lda, const lapack_complex_double* b,
                           lapack_int ldb );

 lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz,
@ -2598,6 +2598,15 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a,
 lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a,
                           lapack_int lda, const double* tau );

+lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 float* a, lapack_int lda,
+                                 const float* t, lapack_int ldt );
+lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 double* a, lapack_int lda,
+                                 const double* t, lapack_int ldt );
+
 lapack_int LAPACKE_sormbr( int matrix_layout, char vect, char side, char trans,
                           lapack_int m, lapack_int n, lapack_int k,
                           const float* a, lapack_int lda, const float* tau,
@ -4577,6 +4586,15 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n,
                           lapack_complex_double* a, lapack_int lda,
                           const lapack_complex_double* tau );

+lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* t, lapack_int ldt );
+
 lapack_int LAPACKE_cunmbr( int matrix_layout, char vect, char side, char trans,
                           lapack_int m, lapack_int n, lapack_int k,
                           const lapack_complex_float* a, lapack_int lda,
@ -6932,11 +6950,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range,

 lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo,
                                lapack_int n, lapack_complex_float* a,
-                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int lda, const lapack_complex_float* b,
                                lapack_int ldb );
 lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo,
                                lapack_int n, lapack_complex_double* a,
-                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int lda, const lapack_complex_double* b,
                                lapack_int ldb );

 lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz,
@ -7880,6 +7898,19 @@ lapack_int LAPACKE_dorgtr_work( int matrix_layout, char uplo, lapack_int n,
                                double* a, lapack_int lda, const double* tau,
                                double* work, lapack_int lwork );

+lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      float* a, lapack_int lda,
+                                      const float* t, lapack_int ldt,
+                                      float* work, lapack_int lwork );
+lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      double* a, lapack_int lda,
+                                      const double* t, lapack_int ldt,
+                                      double* work, lapack_int lwork );
+
 lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side,
                                char trans, lapack_int m, lapack_int n,
                                lapack_int k, const float* a, lapack_int lda,
@ -10281,6 +10312,19 @@ lapack_int LAPACKE_zungtr_work( int matrix_layout, char uplo, lapack_int n,
                                const lapack_complex_double* tau,
                                lapack_complex_double* work, lapack_int lwork );

+lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      lapack_complex_float* a, lapack_int lda,
+                                      const lapack_complex_float* t, lapack_int ldt,
+                                      lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      lapack_complex_double* a, lapack_int lda,
+                                      const lapack_complex_double* t, lapack_int ldt,
+                                      lapack_complex_double* work, lapack_int lwork );
+
 lapack_int LAPACKE_cunmbr_work( int matrix_layout, char vect, char side,
                                char trans, lapack_int m, lapack_int n,
                                lapack_int k, const lapack_complex_float* a,
@ -10553,11 +10597,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                  const lapack_int* ipiv,
                                  lapack_complex_float* work, lapack_int nb );
 lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, lapack_complex_float* a,
+                            lapack_int nrhs, const lapack_complex_float* a,
                            lapack_int lda, const lapack_int* ipiv,
                            lapack_complex_float* b, lapack_int ldb );
 lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, lapack_complex_float* a,
+                                 lapack_int nrhs, const lapack_complex_float* a,
                                 lapack_int lda, const lapack_int* ipiv,
                                 lapack_complex_float* b, lapack_int ldb,
                                 lapack_complex_float* work );
@ -10718,10 +10762,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                  const lapack_int* ipiv, double* work,
                                  lapack_int nb );
 lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, double* a, lapack_int lda,
+                            lapack_int nrhs, const double* a, lapack_int lda,
                            const lapack_int* ipiv, double* b, lapack_int ldb );
 lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, double* a,
+                                 lapack_int nrhs, const double* a,
                                 lapack_int lda, const lapack_int* ipiv,
                                 double* b, lapack_int ldb, double* work );
 lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, char jobu2,
@ -10813,10 +10857,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                  const lapack_int* ipiv, float* work,
                                  lapack_int nb );
 lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, float* a, lapack_int lda,
+                            lapack_int nrhs, const float* a, lapack_int lda,
                            const lapack_int* ipiv, float* b, lapack_int ldb );
 lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, float* a,
+                                 lapack_int nrhs, const float* a,
                                 lapack_int lda, const lapack_int* ipiv,
                                 float* b, lapack_int ldb, float* work );
 lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2,
@ -10898,11 +10942,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                  const lapack_int* ipiv,
                                  lapack_complex_double* work, lapack_int nb );
 lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, lapack_complex_double* a,
+                            lapack_int nrhs, const lapack_complex_double* a,
                            lapack_int lda, const lapack_int* ipiv,
                            lapack_complex_double* b, lapack_int ldb );
 lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, lapack_complex_double* a,
+                                 lapack_int nrhs, const lapack_complex_double* a,
                                 lapack_int lda, const lapack_int* ipiv,
                                 lapack_complex_double* b, lapack_int ldb,
                                 lapack_complex_double* work );
@ -12026,6 +12070,44 @@ lapack_int LAPACKE_zgetsls_work( int matrix_layout, char trans, lapack_int m,
                                 lapack_complex_double* b, lapack_int ldb,
                                 lapack_complex_double* work, lapack_int lwork );

+lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               float* a, lapack_int lda,
+                               float* t, lapack_int ldt );
+lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               double* a, lapack_int lda,
+                               double* t, lapack_int ldt );
+lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    float* a, lapack_int lda,
+                                    float* t, lapack_int ldt,
+                                    float* work, lapack_int lwork );
+lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    double* a, lapack_int lda,
+                                    double* t, lapack_int ldt,
+                                    double* work, lapack_int lwork );
+lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    lapack_complex_float* a, lapack_int lda,
+                                    lapack_complex_float* t, lapack_int ldt,
+                                    lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    lapack_complex_double* a, lapack_int lda,
+                                    lapack_complex_double* t, lapack_int ldt,
+                                    lapack_complex_double* work, lapack_int lwork );
+
 lapack_int LAPACKE_ssyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n,
                          float* a, lapack_int lda, float* w );
 lapack_int LAPACKE_dsyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n,
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@ -162,6 +162,8 @@ lapacke_cgetrs.o \
 lapacke_cgetrs_work.o \
 lapacke_cgetsls.o \
 lapacke_cgetsls_work.o \
+lapacke_cgetsqrhrt.o \
+lapacke_cgetsqrhrt_work.o \
 lapacke_cggbak.o \
 lapacke_cggbak_work.o \
 lapacke_cggbal.o \
@ -634,6 +636,8 @@ lapacke_cungrq.o \
 lapacke_cungrq_work.o \
 lapacke_cungtr.o \
 lapacke_cungtr_work.o \
+lapacke_cungtsqr_row.o \
+lapacke_cungtsqr_row_work.o \
 lapacke_cunmbr.o \
 lapacke_cunmbr_work.o \
 lapacke_cunmhr.o \
@ -778,6 +782,8 @@ lapacke_dgetrs.o \
 lapacke_dgetrs_work.o \
 lapacke_dgetsls.o \
 lapacke_dgetsls_work.o \
+lapacke_dgetsqrhrt.o \
+lapacke_dgetsqrhrt_work.o \
 lapacke_dggbak.o \
 lapacke_dggbak_work.o \
 lapacke_dggbal.o \
@ -900,6 +906,8 @@ lapacke_dorgrq.o \
 lapacke_dorgrq_work.o \
 lapacke_dorgtr.o \
 lapacke_dorgtr_work.o \
+lapacke_dorgtsqr_row.o \
+lapacke_dorgtsqr_row_work.o \
 lapacke_dormbr.o \
 lapacke_dormbr_work.o \
 lapacke_dormhr.o \
@ -1348,6 +1356,8 @@ lapacke_sgetrs.o \
 lapacke_sgetrs_work.o \
 lapacke_sgetsls.o \
 lapacke_sgetsls_work.o \
+lapacke_sgetsqrhrt.o \
+lapacke_sgetsqrhrt_work.o \
 lapacke_sggbak.o \
 lapacke_sggbak_work.o \
 lapacke_sggbal.o \
@ -1468,6 +1478,8 @@ lapacke_sorgrq.o \
 lapacke_sorgrq_work.o \
 lapacke_sorgtr.o \
 lapacke_sorgtr_work.o \
+lapacke_sorgtsqr_row.o \
+lapacke_sorgtsqr_row_work.o \
 lapacke_sormbr.o \
 lapacke_sormbr_work.o \
 lapacke_sormhr.o \
@ -1908,6 +1920,8 @@ lapacke_zgetrs.o \
 lapacke_zgetrs_work.o \
 lapacke_zgetsls.o \
 lapacke_zgetsls_work.o \
+lapacke_zgetsqrhrt.o \
+lapacke_zgetsqrhrt_work.o \
 lapacke_zggbak.o \
 lapacke_zggbak_work.o \
 lapacke_zggbal.o \
@ -2380,6 +2394,8 @@ lapacke_zungrq.o \
 lapacke_zungrq_work.o \
 lapacke_zungtr.o \
 lapacke_zungtr_work.o \
+lapacke_zungtsqr_row.o \
+lapacke_zungtsqr_row_work.o \
 lapacke_zunmbr.o \
 lapacke_zunmbr_work.o \
 lapacke_zunmhr.o \
--- a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c
@ -56,6 +56,8 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt,
                             ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
        lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n :
                              ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1);
+        lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) ||
+                               LAPACKE_lsame( jobvt, 's' ) ) ? n : 1;
        lapack_int lda_t = MAX(1,m);
        lapack_int ldu_t = MAX(1,nrows_u);
        lapack_int ldvt_t = MAX(1,nrows_vt);
@ -73,7 +75,7 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt,
            LAPACKE_xerbla( "LAPACKE_cgesvd_work", info );
            return info;
        }
-        if( ldvt < n ) {
+        if( ldvt < ncols_vt ) {
            info = -12;
            LAPACKE_xerbla( "LAPACKE_cgesvd_work", info );
            return info;
--- a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c
@ -0,0 +1,80 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgetsqrhrt
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* t, lapack_int ldt )
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_complex_float* work = NULL;
+    lapack_complex_float work_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -7;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
+                                    a, lda, t, ldt, &work_query, lwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork = LAPACK_C2INT( work_query );
+    /* Allocate memory for work arrays */
+    work = (lapack_complex_float*)
+        LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
+                                    a, lda, t, ldt, work, lwork );
+    /* Release memory and exit */
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c
@ -0,0 +1,108 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function cgetsqrhrt
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    lapack_complex_float* a, lapack_int lda,
+                                    lapack_complex_float* t, lapack_int ldt,
+                                    lapack_complex_float* work, lapack_int lwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt,
+                           work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_complex_float* a_t = NULL;
+        lapack_int ldt_t = MAX(1,nb2);
+        lapack_complex_float* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -10;
+            LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t,
+                               work, &lwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)
+            LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_float*)
+            LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t,
+                           work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
            LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
        } else {
            LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
            LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
        } else {
            LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); 
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
            LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
        } else { 
            LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
--- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c
@ -35,7 +35,7 @@

 lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo,
                           lapack_int n, lapack_complex_float* a,
-                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int lda, const lapack_complex_float* b,
                           lapack_int ldb )
 {
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
--- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c
@ -35,7 +35,7 @@

 lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo,
                                lapack_int n, lapack_complex_float* a,
-                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int lda, const lapack_complex_float* b,
                                lapack_int ldb )
 {
    lapack_int info = 0;
--- a/lapack-netlib/LAPACKE/src/lapacke_chegv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegv.c
@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -6;
        }
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) {
            return -8;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c
@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv_2stage( int matrix_layout, lapack_int itype, char jobz,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -6;
        }
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) {
            return -8;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c
@ -55,10 +55,10 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -6;
        }
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) {
            return -8;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c
@ -60,7 +60,7 @@ lapack_int LAPACKE_chegvx( int matrix_layout, lapack_int itype, char jobz,
        if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) {
            return -15;
        }
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) {
            return -9;
        }
        if( LAPACKE_lsame( range, 'v' ) ) {
--- a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c
@ -46,7 +46,7 @@ lapack_int LAPACKE_chetri2x( int matrix_layout, char uplo, lapack_int n,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -4;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c
@ -42,9 +42,6 @@ lapack_int LAPACKE_clacpy_work( int matrix_layout, char uplo, lapack_int m,
    if( matrix_layout == LAPACK_COL_MAJOR ) {
        /* Call LAPACK function and adjust info */
        LAPACK_clacpy( &uplo, &m, &n, a, &lda, b, &ldb );
-        if( info < 0 ) {
-            info = info - 1;
-        }
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
        lapack_int lda_t = MAX(1,m);
        lapack_int ldb_t = MAX(1,m);
--- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c
@ -41,45 +41,46 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo,
    lapack_int info = 0;
    float res = 0.;
    if( matrix_layout == LAPACK_COL_MAJOR ) {
-        /* Call LAPACK function and adjust info */
+        /* Call LAPACK function */
        res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
-        lapack_int lda_t = MAX(1,m);
-        lapack_complex_float* a_t = NULL;
        float* work_lapack = NULL;
+        char norm_lapack;
+        char uplo_lapack;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -8;
            LAPACKE_xerbla( "LAPACKE_clantr_work", info );
            return info;
        }
-        /* Allocate memory for temporary array(s) */
-        a_t = (lapack_complex_float*)
-            LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) );
-        if( a_t == NULL ) {
-            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
-            goto exit_level_0;
+        if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) {
+            norm_lapack = 'i';
+        } else if( LAPACKE_lsame( norm, 'i' ) ) {
+            norm_lapack = '1';
+        } else {
+            norm_lapack = norm;
+        }
+        if( LAPACKE_lsame( uplo, 'u' ) ) {
+            uplo_lapack = 'l';
+        } else {
+            uplo_lapack = 'u';
        }
        /* Allocate memory for work array(s) */
-        if( LAPACKE_lsame( norm, 'i' ) ) {
-            work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) );
+        if( LAPACKE_lsame( norm_lapack, 'i' ) ) {
+            work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) );
            if( work_lapack == NULL ) {
                info = LAPACK_WORK_MEMORY_ERROR;
-                goto exit_level_1;
+                goto exit_level_0;
            }
        }
-        /* Transpose input matrices */
-        LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t );
-        /* Call LAPACK function and adjust info */
-        res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack );
+        /* Call LAPACK function */
+        res = LAPACK_clantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack );
        /* Release memory and exit */
        if( work_lapack ) {
            LAPACKE_free( work_lapack );
        }
-exit_level_1:
-        LAPACKE_free( a_t );
 exit_level_0:
-        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+        if( info == LAPACK_WORK_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_clantr_work", info );
        }
    } else {
--- a/lapack-netlib/LAPACKE/src/lapacke_clascl.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clascl.c
@ -83,6 +83,7 @@ lapack_int LAPACKE_clascl( int matrix_layout, char type, lapack_int kl,
                LAPACKE_cgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) {
                return -9;
            }
+            break;
        case 'B':
            // TYPE = 'B' - lower part of symmetric band matrix (assume m==n)
            if( LAPACKE_chb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) {
--- a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c
@ -42,9 +42,6 @@ lapack_int LAPACKE_claset_work( int matrix_layout, char uplo, lapack_int m,
    if( matrix_layout == LAPACK_COL_MAJOR ) {
        /* Call LAPACK function and adjust info */
        LAPACK_claset( &uplo, &m, &n, &alpha, &beta, a, &lda );
-        if( info < 0 ) {
-            info = info - 1;
-        }
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
        lapack_int lda_t = MAX(1,m);
        lapack_complex_float* a_t = NULL;
--- a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c
@ -45,7 +45,7 @@ lapack_int LAPACKE_csyconv( int matrix_layout, char uplo, char way, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_csy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -5;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c
@ -34,7 +34,7 @@
 #include "lapacke_utils.h"

 lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, lapack_complex_float* a,
+                            lapack_int nrhs, const lapack_complex_float* a,
                            lapack_int lda, const lapack_int* ipiv,
                            lapack_complex_float* b, lapack_int ldb )
 {
--- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c
@ -34,7 +34,7 @@
 #include "lapacke_utils.h"

 lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, lapack_complex_float* a,
+                                 lapack_int nrhs, const lapack_complex_float* a,
                                 lapack_int lda, const lapack_int* ipiv,
                                 lapack_complex_float* b, lapack_int ldb,
                                 lapack_complex_float* work )
--- a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c
@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttf( int matrix_layout, char transr, char uplo,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) {
            return -5;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c
@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttp( int matrix_layout, char uplo, lapack_int n,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) {
            return -4;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c
@ -48,7 +48,7 @@ lapack_int LAPACKE_cungtr( int matrix_layout, char uplo, lapack_int n,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -4;
        }
        if( LAPACKE_c_nancheck( n-1, tau, 1 ) ) {
--- a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c
@ -0,0 +1,83 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cungtsqr_row
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* t, lapack_int ldt )
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_complex_float* work = NULL;
+    lapack_complex_float work_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cungtsqr_row", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -6;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, nb, n, t, ldt ) ) {
+            return -8;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb,
+                                      a, lda, t, ldt, &work_query, lwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork = LAPACK_C2INT( work_query );
+    /* Allocate memory for work arrays */
+    work = (lapack_complex_float*)
+        LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb,
+                                      a, lda, t, ldt, work, lwork );
+    /* Release memory and exit */
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cungtsqr_row", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c
@ -0,0 +1,109 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function cungtsqr_row
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      lapack_complex_float* a, lapack_int lda,
+                                      const lapack_complex_float* t, lapack_int ldt,
+                                      lapack_complex_float* work, lapack_int lwork )
+{
+    lapack_int info = 0;
+    if (matrix_layout == LAPACK_COL_MAJOR) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt,
+                             work, &lwork, &info);
+        if (info < 0) {
+            info = info - 1;
+        }
+    } else if (matrix_layout == LAPACK_ROW_MAJOR) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_complex_float* a_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -7;
+            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+            return info;
+        }
+        lapack_int ldt_t = MAX(1,nb);
+        lapack_complex_float* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldt < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t,
+                                 work, &lwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)
+            LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_float*)
+            LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_cge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c
@ -52,7 +52,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans,
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        r = LAPACKE_lsame( side, 'l' ) ? m : n;
-        if( LAPACKE_cge_nancheck( matrix_layout, r, r, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, r, a, lda ) ) {
            return -7;
        }
        if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
--- a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c
@ -54,6 +54,8 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt,
                             ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
        lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n :
                              ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1);
+        lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) ||
+                               LAPACKE_lsame( jobvt, 's' ) ) ? n : 1;
        lapack_int lda_t = MAX(1,m);
        lapack_int ldu_t = MAX(1,nrows_u);
        lapack_int ldvt_t = MAX(1,nrows_vt);
@ -71,7 +73,7 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt,
            LAPACKE_xerbla( "LAPACKE_dgesvd_work", info );
            return info;
        }
-        if( ldvt < n ) {
+        if( ldvt < ncols_vt ) {
            info = -12;
            LAPACKE_xerbla( "LAPACKE_dgesvd_work", info );
            return info;
--- a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
@ -0,0 +1,79 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function dgetsqrhrt
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               double* a, lapack_int lda,
+                               double* t, lapack_int ldt )
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    double* work = NULL;
+    double work_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -7;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
+                                    a, lda, t, ldt, &work_query, lwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork = (lapack_int)work_query;
+    /* Allocate memory for work arrays */
+    work = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
+                                    a, lda, t, ldt, work, lwork );
+    /* Release memory and exit */
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c
@ -0,0 +1,106 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function dgetsqrhrt
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    double* a, lapack_int lda,
+                                    double* t, lapack_int ldt,
+                                    double* work, lapack_int lwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt,
+                           work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        double* a_t = NULL;
+        lapack_int ldt_t = MAX(1,nb2);
+        double* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -10;
+            LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t,
+                               work, &lwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t,
+                           work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c
@ -41,9 +41,6 @@ lapack_int LAPACKE_dlacpy_work( int matrix_layout, char uplo, lapack_int m,
    if( matrix_layout == LAPACK_COL_MAJOR ) {
        /* Call LAPACK function and adjust info */
        LAPACK_dlacpy( &uplo, &m, &n, a, &lda, b, &ldb );
-        if( info < 0 ) {
-            info = info - 1;
-        }
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
        lapack_int lda_t = MAX(1,m);
        lapack_int ldb_t = MAX(1,m);
--- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c
@ -40,44 +40,46 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo,
    lapack_int info = 0;
    double res = 0.;
    if( matrix_layout == LAPACK_COL_MAJOR ) {
-        /* Call LAPACK function and adjust info */
+        /* Call LAPACK function */
        res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
-        lapack_int lda_t = MAX(1,m);
-        double* a_t = NULL;
        double* work_lapack = NULL;
+        char norm_lapack;
+        char uplo_lapack;
        /* Check leading dimension(s) */
        if( lda < n ) {
            info = -8;
            LAPACKE_xerbla( "LAPACKE_dlantr_work", info );
            return info;
        }
-        /* Allocate memory for temporary array(s) */
-        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) );
-        if( a_t == NULL ) {
-            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
-            goto exit_level_0;
+        if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) {
+            norm_lapack = 'i';
+        } else if( LAPACKE_lsame( norm, 'i' ) ) {
+            norm_lapack = '1';
+        } else {
+            norm_lapack = norm;
+        }
+        if( LAPACKE_lsame( uplo, 'u' ) ) {
+            uplo_lapack = 'l';
+        } else {
+            uplo_lapack = 'u';
        }
        /* Allocate memory for work array(s) */
-        if( LAPACKE_lsame( norm, 'i' ) ) {
-            work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) );
+        if( LAPACKE_lsame( norm_lapack, 'i' ) ) {
+            work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) );
            if( work_lapack == NULL ) {
                info = LAPACK_WORK_MEMORY_ERROR;
-                goto exit_level_1;
+                goto exit_level_0;
            }
        }
-        /* Transpose input matrices */
-        LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t );
-        /* Call LAPACK function and adjust info */
-        res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack );
+        /* Call LAPACK function */
+        res = LAPACK_dlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack );
        /* Release memory and exit */
        if( work_lapack ) {
            LAPACKE_free( work_lapack );
        }
-exit_level_1:
-        LAPACKE_free( a_t );
 exit_level_0:
-        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+        if( info == LAPACK_WORK_MEMORY_ERROR ) {
            LAPACKE_xerbla( "LAPACKE_dlantr_work", info );
        }
    } else {
--- a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c
@ -83,6 +83,7 @@ lapack_int LAPACKE_dlascl( int matrix_layout, char type, lapack_int kl,
                LAPACKE_dgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) {
                return -9;
            }
+            break;
        case 'B':
            // TYPE = 'B' - lower part of symmetric band matrix (assume m==n)
            if( LAPACKE_dsb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) {
--- a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c
@ -41,9 +41,6 @@ lapack_int LAPACKE_dlaset_work( int matrix_layout, char uplo, lapack_int m,
    if( matrix_layout == LAPACK_COL_MAJOR ) {
        /* Call LAPACK function and adjust info */
        LAPACK_dlaset( &uplo, &m, &n, &alpha, &beta, a, &lda );
-        if( info < 0 ) {
-            info = info - 1;
-        }
    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
        lapack_int lda_t = MAX(1,m);
        double* a_t = NULL;
--- a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c
@ -47,7 +47,7 @@ lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -4;
        }
        if( LAPACKE_d_nancheck( n-1, tau, 1 ) ) {
--- a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c
@ -0,0 +1,82 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function dorgtsqr_row
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 double* a, lapack_int lda,
+                                 const double* t, lapack_int ldt )
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    double* work = NULL;
+    double work_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -6;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, nb, n, t, ldt ) ) {
+            return -8;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb,
+                                      a, lda, t, ldt, &work_query, lwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork = (lapack_int)work_query;
+    /* Allocate memory for work arrays */
+    work = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb,
+                                      a, lda, t, ldt, work, lwork );
+    /* Release memory and exit */
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c
@ -0,0 +1,108 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function dorgtsqr_row
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      double* a, lapack_int lda,
+                                      const double* t, lapack_int ldt,
+                                      double* work, lapack_int lwork )
+{
+    lapack_int info = 0;
+    if (matrix_layout == LAPACK_COL_MAJOR) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt,
+                             work, &lwork, &info);
+        if (info < 0) {
+            info = info - 1;
+        }
+    } else if (matrix_layout == LAPACK_ROW_MAJOR) {
+        lapack_int lda_t = MAX(1,m);
+        double* a_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -7;
+            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+            return info;
+        }
+        lapack_int ldt_t = MAX(1,nb);
+        double* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldt < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t,
+                                 work, &lwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_dge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c
@ -51,7 +51,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans,
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
        r = LAPACKE_lsame( side, 'l' ) ? m : n;
-        if( LAPACKE_dge_nancheck( matrix_layout, r, r, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, r, a, lda ) ) {
            return -7;
        }
        if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c
@ -43,7 +43,7 @@ lapack_int LAPACKE_dsyconv( int matrix_layout, char uplo, char way, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -5;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
@ -72,7 +72,7 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
            LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
        } else {
            LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
            LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
        } else {
            LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c
@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
            LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
        } else {
            LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
--- a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c
@ -47,7 +47,7 @@ lapack_int LAPACKE_dsygst( int matrix_layout, lapack_int itype, char uplo,
        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -5;
        }
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) {
            return -7;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c
@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv( int matrix_layout, lapack_int itype, char jobz,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -6;
        }
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) {
            return -8;
        }
    }
--- a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c
@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv_2stage( int matrix_layout, lapack_int itype, char jobz,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -6;
        }
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) {
            return -8;
        }
    }
--- a/Show More
+++ b/Show More