update

2020-01-02 11:01:57 +08:00 · 2020-01-02 11:01:57 +08:00 · 80db5f11e1
parent 8d84403205 44028581cc
commit 80db5f11e1
831 changed files with 37975 additions and 12110 deletions
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@ -178,4 +178,4 @@ In chronological order:
  * [2019-11-06] optimize AVX512 SGEMM
  * [2019-11-12] AVX512 CGEMM & ZGEMM kernels
  * [2019-12-23] optimize AVX2 CGEMM and ZGEMM
-  * [2019-12-27] AVX2 CGEMM3M kernel
+  * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
--- a/20
+++ b/20
@ -247,21 +247,21 @@ prof_lapack : lapack_prebuild

 lapack_prebuild :
 ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-	-@echo "FORTRAN     = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "OPTS        = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "FC          = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "FFLAGS      = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "POPTS       = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "NOOPT       = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "FFLAGS_NOOPT       = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "PNOOPT      = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "LOADOPTS    = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LDFLAGS     = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "CC          = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "override CFLAGS      = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "override ARCH        = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "ARCHFLAGS   = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "AR          = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "ARFLAGS     = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "RANLIB      = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "LAPACKLIB   = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "TMGLIB      = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LAPACKLIB   = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "TMGLIB      = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "BLASLIB     = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "LAPACKELIB  = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LAPACKELIB  = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "SUFFIX      = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "PSUFFIX     = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -319,7 +319,7 @@ lapack-test :
 ifneq ($(CROSS), 1)
 	( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
        ./testsecond; ./testdsecnd; ./testieee; ./testversion )
-	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
+	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
 endif

 lapack-runtest:
--- a/Makefile.system
+++ b/Makefile.system
@ -25,6 +25,8 @@ else ifeq ($(ARCH), i386)
 override ARCH=x86
 else ifeq ($(ARCH), aarch64)
 override ARCH=arm64
+else ifeq ($(ARCH), zarch)
+override ARCH=zarch
 endif

 NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
@ -558,6 +560,11 @@ DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110
 endif

+ifeq ($(ARCH), zarch)
+DYNAMIC_CORE = Z13
+DYNAMIC_CORE += Z14
+endif
+
 ifeq ($(ARCH), power)
 DYNAMIC_CORE = POWER6
 DYNAMIC_CORE += POWER8
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@ -115,7 +115,9 @@ set(SLASRC
   stplqt.f stplqt2.f stpmlqt.f
   ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
   ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
-   ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
+   ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
+   scombssq.f sgesvdq.f slaorhr_col_getrfnp.f
+   slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )

 set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
   sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@ -210,7 +212,9 @@ set(CLASRC
   ctplqt.f ctplqt2.f ctpmlqt.f
   chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
   cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
-   chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
+   chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
+   cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f 
+   cungtsqr.f cunhr_col.f )

 set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
   cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@ -299,7 +303,9 @@ set(DLASRC
   dtplqt.f dtplqt2.f dtpmlqt.f
   dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
   dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
-   dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
+   dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
+   dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
+   dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )

 set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
   dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@ -398,7 +404,9 @@ set(ZLASRC
   zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
   zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
   zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
-   zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
+   zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
+   zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
+   zungtsqr.f zunhr_col.f)

 set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
   zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@ -715,6 +715,8 @@ set(DSRC
  lapacke_dgesv_work.c
  lapacke_dgesvd.c
  lapacke_dgesvd_work.c
+  lapacke_dgesvdq.c
+  lapacke_dgesvdq_work.c
  lapacke_dgesvdx.c
  lapacke_dgesvdx_work.c
  lapacke_dgesvj.c
@ -1287,6 +1289,8 @@ set(SSRC
  lapacke_sgesv_work.c
  lapacke_sgesvd.c
  lapacke_sgesvd_work.c
+  lapacke_sgesvdq.c
+  lapacke_sgesvdq_work.c
  lapacke_sgesvdx.c
  lapacke_sgesvdx_work.c
  lapacke_sgesvj.c
@ -1853,6 +1857,8 @@ set(ZSRC
  lapacke_zgesv_work.c
  lapacke_zgesvd.c
  lapacke_zgesvd_work.c
+  lapacke_zgesvdq.c
+  lapacke_zgesvdq_work.c
  lapacke_zgesvdx.c
  lapacke_zgesvdx_work.c
  lapacke_zgesvj.c
--- a/ctest/din3
+++ b/ctest/din3
@ -5,7 +5,7 @@ T        LOGICAL FLAG, T TO STOP ON FAILURES.
 T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 16.0     THRESHOLD VALUE OF TEST RATIO
-7                 NUMBER OF VALUES OF N
+7                 NUMBER OF VALUES OF N
 1 2 3 5 7 9 35    VALUES OF N
 3                 NUMBER OF VALUES OF ALPHA
 0.0 1.0 0.7       VALUES OF ALPHA
--- a/ctest/sin3
+++ b/ctest/sin3
@ -5,7 +5,7 @@ T        LOGICAL FLAG, T TO STOP ON FAILURES.
 T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 16.0     THRESHOLD VALUE OF TEST RATIO
-7                 NUMBER OF VALUES OF N
+7                 NUMBER OF VALUES OF N
 0 1 2 3 5 9 35    VALUES OF N
 3                 NUMBER OF VALUES OF ALPHA
 0.0 1.0 0.7       VALUES OF ALPHA
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@ -21,9 +21,13 @@ else
 ifeq ($(ARCH),power)
 COMMONOBJS	+=  dynamic_power.$(SUFFIX)
 else
+ifeq ($(ARCH),zarch)
+COMMONOBJS += dynamic_zarch.$(SUFFIX)
+else
 COMMONOBJS	+=  dynamic.$(SUFFIX)
 endif
 endif
+endif
 else
 COMMONOBJS	+=  parameter.$(SUFFIX)
 endif
@ -85,9 +89,13 @@ else
 ifeq ($(ARCH),power)
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
 else
+ifeq ($(ARCH),zarch)
+HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
+else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
 endif
 endif
+endif
 else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
 endif
--- a/driver/others/dynamic_zarch.c
+++ b/driver/others/dynamic_zarch.c
@ -0,0 +1,141 @
+
+#include "common.h"
+
+/* Kernel tables of the cores this file can select; no Z15 table is referenced (z15 falls back to Z14). */
+extern gotoblas_t gotoblas_Z13;
+extern gotoblas_t gotoblas_Z14;
+
+#define NUM_CORETYPES 5
+
+extern void openblas_warning(int verbose, const char* msg);
+
+/* Core names accepted via OPENBLAS_CORETYPE; the index is used by force_coretype(). */
+static char* corename[] = {
+	"unknown",
+	"Z13",
+	"Z14",
+	"Z15",
+	"ZARCH_GENERIC",
+};
+
+/* Return the name of the kernel table currently installed in gotoblas. */
+char* gotoblas_corename(void) {
+	if (gotoblas == &gotoblas_Z13)	return corename[1];
+	if (gotoblas == &gotoblas_Z14)	return corename[2];
+	return corename[0]; // try generic?
+}
+
+/*
+ * Pick a kernel table from the machine type on the "Type" line of /proc/sysinfo.
+ * __builtin_cpu_is is not supported by zarch.
+ * Returns NULL if the file cannot be opened or the type is not recognised.
+ */
+static gotoblas_t* get_coretype(void) {
+	FILE* infile;
+	char buffer[512], * p;
+
+	p = (char*)NULL;
+	infile = fopen("/proc/sysinfo", "r");
+	if (infile == NULL) return NULL;
+
+	while (fgets(buffer, sizeof(buffer), infile)) {
+		if (!strncmp("Type", buffer, 4)) {
+			// search from the colon on; p stays NULL when the line has none
+			p = strchr(buffer, ':');
+			break;
+		}
+	}
+
+	fclose(infile);
+
+	if (p == NULL) return NULL;
+
+	if (strstr(p, "2964")) return &gotoblas_Z13;
+	if (strstr(p, "2965")) return &gotoblas_Z13;
+	if (strstr(p, "3906")) return &gotoblas_Z14;
+	if (strstr(p, "3907")) return &gotoblas_Z14;
+	if (strstr(p, "8561")) return &gotoblas_Z14;        // fallback z15 to z14
+	if (strstr(p, "8562")) return &gotoblas_Z14;        // fallback z15 to z14
+
+	return NULL; // should be ZARCH_GENERIC
+}
+
+/*
+ * Map a core name (compared case-insensitively) to its kernel table.
+ * Emits a warning and returns NULL when the name has no kernel table.
+ */
+static gotoblas_t* force_coretype(char* coretype) {
+
+	int i;
+	int found = -1;
+	char message[128];
+
+	for (i = 0; i < NUM_CORETYPES; i++)
+	{
+		if (!strncasecmp(coretype, corename[i], 20))
+		{
+			found = i;
+			break;
+		}
+	}
+
+	switch (found)
+	{
+	case  1: return (&gotoblas_Z13);
+	case  2: return (&gotoblas_Z14);
+	case  3: return (&gotoblas_Z14);	// fallback z15 to z14
+	default: break;
+	}
+
+	snprintf(message, sizeof(message), "Core not found: %s\n", coretype);
+	openblas_warning(1, message);
+	return NULL;
+}
+
+/*
+ * Select the kernel table once: OPENBLAS_CORETYPE wins over detection,
+ * and Z14 is used when neither yields a table.
+ */
+void gotoblas_dynamic_init(void) {
+
+	char coremsg[128];
+	char coren[22];
+	char* p;
+
+
+	if (gotoblas) return;
+
+	p = getenv("OPENBLAS_CORETYPE");
+	if (p)
+	{
+		gotoblas = force_coretype(p);
+	}
+	else
+	{
+		gotoblas = get_coretype();
+	}
+
+	if (gotoblas == NULL)
+	{
+		snprintf(coremsg, sizeof(coremsg), "Falling back to Z14 core\n");
+		openblas_warning(1, coremsg);
+		gotoblas = &gotoblas_Z14;
+	}
+
+	if (gotoblas && gotoblas->init) {
+		strncpy(coren, gotoblas_corename(), 20);
+		coren[20] = '\0';	// strncpy does not terminate a 20-char source
+		snprintf(coremsg, sizeof(coremsg), "Core: %s\n", coren);
+		openblas_warning(2, coremsg);
+		gotoblas->init();
+	}
+	else {
+		openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
+		exit(1);
+	}
+}
+
+/* Forget the selected kernel table so a later init selects again. */
+void gotoblas_dynamic_quit(void) {
+	gotoblas = NULL;
+}
--- a/exports/gensymbol
+++ b/exports/gensymbol
@ -694,7 +694,19 @@
    
    # functions added for lapack-3.8.0

-    ilaenv2stage
+    ilaenv2stage,
+
+    # functions added for lapack-3.9.0
+    cgesvdq,
+    cungtsqr,
+    dcombssq,
+    dgesvdq,
+    dorgtsqr,
+    scombssq,
+    sgesvdq,
+    sorgtsqr,
+    zgesvdq,
+    zungtsqr
 );

@lapack_extendedprecision_objs = (
@ -3347,6 +3359,15 @@
    LAPACKE_zsytrf_aa_2stage_work,
    LAPACKE_zsytrs_aa_2stage,
    LAPACKE_zsytrs_aa_2stage_work,
+    
+    # new functions from 3.9.0
+    LAPACKE_dgesvdq,
+    LAPACKE_dgesvdq_work,
+    LAPACKE_sgesvdq,
+    LAPACKE_sgesvdq_work,
+    LAPACKE_zgesvdq,
+    LAPACKE_zgesvdq_work
+
 );

 #These function may need 2 underscores.
@ -3419,7 +3440,13 @@
    dsytrf_aa_2stage, dsytrs_aa_2stage,
    zhesv_aa_2stage, zhetrf_aa_2stage,
    zhetrs_aa_2stage, zsysv_aa_2stage,
-    zsytrf_aa_2stage, zsytrs_aa_2stage
+    zsytrf_aa_2stage, zsytrs_aa_2stage,
+# 3.9.0
+    claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col,
+    dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col,
+    slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col,
+    zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col
+
 );


--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@ -103,26 +103,34 @@ ZDOTKERNEL     = zdot.S
 DSDOTKERNEL    = dot.S

 DGEMM_BETA     = dgemm_beta.S
+SGEMM_BETA     = sgemm_beta.S

 SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_N), 4)
-SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
+ifeq ($(SGEMM_UNROLL_M), 4)
+SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
 else
 SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
 endif
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
-
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
 ifeq ($(SGEMM_UNROLL_N), 4)
 SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
 else
 SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
 endif
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)

--- a/kernel/arm64/KERNEL.TSV110
+++ b/kernel/arm64/KERNEL.TSV110
@ -109,22 +109,29 @@ ZGEMVTKERNEL = zgemv_t.S
 SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_N), 4)
-SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
+ifeq ($(SGEMM_UNROLL_M), 4)
+SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
 else
 SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
 endif
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
-
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
 ifeq ($(SGEMM_UNROLL_N), 4)
 SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
 else
 SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
 endif
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)

--- a/kernel/arm64/dgemm_beta.S
+++ b/kernel/arm64/dgemm_beta.S
@ -43,7 +43,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define betaV0		v11.d[0]
 #define I	x16

-#define size 128
+#define prfm_size 640
+#define calc_size 128

 /**************************************************************************************
 * Macro definitions
@ -126,20 +127,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmul v2.2d, v2.2d, betaV0
 	fmul v3.2d, v3.2d, betaV0

+	prfm    PLDL1KEEP, [A01, prfm_size]
+
 	fmul v4.2d, v4.2d, betaV0
 	fmul v5.2d, v5.2d, betaV0

+	prfm    PLDL1KEEP, [A03, prfm_size]
+
 	fmul v6.2d, v6.2d, betaV0
 	fmul v7.2d, v7.2d, betaV0

 	st1	{v0.2d, v1.2d}, [A01]
-	add	A01, A01, size
+	add	A01, A01, calc_size
 	st1	{v2.2d, v3.2d}, [A02]
-	add	A02, A02, size
+	add	A02, A02, calc_size
 	st1	{v4.2d, v5.2d}, [A03]
-	add	A03, A03, size
+	add	A03, A03, calc_size
 	st1	{v6.2d, v7.2d}, [A04]
-	add	A04, A04, size
+	add	A04, A04, calc_size

 	subs    I , I , #1
 	bne	.Lgemm_beta_03
--- a/kernel/arm64/sgemm_beta.S
+++ b/kernel/arm64/sgemm_beta.S
@ -0,0 +1,259 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	M	x0
+#define	N	x1
+#define	BETA    s0
+#define	LDC     x6
+#define	C00	x7
+
+#define	A01	x8
+#define	A02	x9
+#define	A03	x10
+#define	A04	x11
+#define I  	x12
+
+#define beta0		s11
+#define betaV0		v11.s[0]
+
+#define prfm_size 640
+#define calc_size 128
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro SAVE_REGS
+	add	sp, sp, #-(11 * 16)		// reserve 11 slots of 16 bytes on the stack
+	stp	d8, d9, [sp, #(0 * 16)]		// slots 0-4: d8-d17
+	stp	d10, d11, [sp, #(1 * 16)]
+	stp	d12, d13, [sp, #(2 * 16)]
+	stp	d14, d15, [sp, #(3 * 16)]
+	stp	d16, d17, [sp, #(4 * 16)]
+	stp	x18, x19, [sp, #(5 * 16)]	// slots 5-9: x18-x27
+	stp	x20, x21, [sp, #(6 * 16)]
+	stp	x22, x23, [sp, #(7 * 16)]
+	stp	x24, x25, [sp, #(8 * 16)]
+	stp	x26, x27, [sp, #(9 * 16)]
+	str	x28, [sp, #(10 * 16)]		// slot 10: x28 alone
+.endm
+
+.macro RESTORE_REGS
+	ldp	d8, d9, [sp, #(0 * 16)]		// reload d8-d17 from slots 0-4 (same layout as SAVE_REGS)
+	ldp	d10, d11, [sp, #(1 * 16)]
+	ldp	d12, d13, [sp, #(2 * 16)]
+	ldp	d14, d15, [sp, #(3 * 16)]
+	ldp	d16, d17, [sp, #(4 * 16)]
+	ldp	x18, x19, [sp, #(5 * 16)]	// reload x18-x27 from slots 5-9
+	ldp	x20, x21, [sp, #(6 * 16)]
+	ldp	x22, x23, [sp, #(7 * 16)]
+	ldp	x24, x25, [sp, #(8 * 16)]
+	ldp	x26, x27, [sp, #(9 * 16)]
+	ldr	x28, [sp, #(10 * 16)]		// reload x28 from slot 10
+	add	sp, sp, #(11*16)		// release the 11 slots reserved by SAVE_REGS
+.endm
+
+.macro INIT_ZERO
+	movi v0.4s, #0		// beta == 0 path: load exact zeros into v0-v7, which are
+	movi v1.4s, #0		// stored to C afterwards. Scaling the stale register
+	movi v2.4s, #0		// contents by beta instead would turn any NaN/Inf left
+	movi v3.4s, #0		// in them into NaN (NaN * 0 and Inf * 0 are both NaN).
+	movi v4.4s, #0
+	movi v5.4s, #0
+	movi v6.4s, #0
+	movi v7.4s, #0		// v0 (BETA) is not read again on this path; beta0 holds the copy
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	ldr LDC, [sp]
+	SAVE_REGS
+
+.Lgemm_beta_BEGIN:
+
+	fmov beta0, BETA
+	cmp 	N, #0
+	ble	.Lgemm_beta_L999
+
+	fcmp BETA, #0.0
+	beq .Lgemm_beta_zero_01
+
+.Lgemm_beta_01:
+
+	lsl LDC, LDC, #2
+
+	.align 5
+.Lgemm_beta_02:
+
+	mov	A01, C00
+	add C00, C00, LDC
+	asr	I, M, #5
+	cmp 	I, #0
+	ble .Lgemm_beta_04
+	add	A02, A01, #32
+	add	A03, A02, #32
+	add	A04, A03, #32
+
+	.align 5
+.Lgemm_beta_03:
+
+	prfm	PLDL1KEEP, [A01, prfm_size]
+
+	ldp	q0, q1, [A01]
+	ldp	q2, q3, [A02]
+	ldp	q4, q5, [A03]
+	ldp	q6, q7, [A04]
+  
+	fmul v0.4s, v0.4s, betaV0
+	fmul v1.4s, v1.4s, betaV0
+	
+	fmul v2.4s, v2.4s, betaV0
+	fmul v3.4s, v3.4s, betaV0
+	
+	fmul v4.4s, v4.4s, betaV0
+	fmul v5.4s, v5.4s, betaV0
+	
+	fmul v6.4s, v6.4s, betaV0
+	fmul v7.4s, v7.4s, betaV0
+
+	prfm    PLDL1KEEP, [A01, prfm_size + 64]
+
+	st1	{v0.4s, v1.4s}, [A01]
+	add	A01, A01, calc_size
+	st1	{v2.4s, v3.4s}, [A02]
+	add	A02, A02, calc_size
+	st1	{v4.4s, v5.4s}, [A03]
+	add	A03, A03, calc_size
+	st1	{v6.4s, v7.4s}, [A04]
+	add	A04, A04, calc_size
+
+	subs	I , I , #1
+	bne	.Lgemm_beta_03
+
+	.align 5
+.Lgemm_beta_04:
+
+	and	I, M , #31
+	cmp	I, #0
+	ble	.Lgemm_beta_06
+
+	.align 5
+.Lgemm_beta_05:
+
+	ldr	s12, [A01]
+	fmul	s12, s12, beta0
+	str	s12, [A01]
+	add	A01, A01, #4
+
+	subs	I , I , #1
+	bne	.Lgemm_beta_05
+
+	.align 5
+.Lgemm_beta_06:
+
+	subs	N , N, #1			// N--
+	bne	.Lgemm_beta_02
+
+	.align 5
+.Lgemm_beta_L999:
+
+	mov	x0, #0
+	RESTORE_REGS
+	ret
+
+	.align 5
+.Lgemm_beta_zero_01:
+
+	INIT_ZERO
+	lsl LDC, LDC, #2
+
+	.align 5
+.Lgemm_beta_zero_02:
+
+	mov A01, C00
+	add C00, C00, LDC
+
+	asr I, M, #5
+	cmp I, #0
+	ble .Lgemm_beta_zero_04
+	add A02, A01, #32
+	add A03, A02, #32
+	add A04, A03, #32
+
+	.align 5
+.Lgemm_beta_zero_03:
+
+	st1	{v0.4s, v1.4s}, [A01]
+	add	 A01, A01, calc_size
+	st1	{v2.4s, v3.4s}, [A02]
+	add 	A02, A02, calc_size
+	st1	{v4.4s, v5.4s}, [A03]
+	add	A03, A03, calc_size
+	st1	{v6.4s, v7.4s}, [A04]
+	add	A04, A04, calc_size
+
+	subs I, I, #1
+	bne .Lgemm_beta_zero_03
+
+	.align 5
+.Lgemm_beta_zero_04:
+
+	and I, M, #31
+	cmp I, #0
+	ble .Lgemm_beta_zero_06
+
+	.align 5
+.Lgemm_beta_zero_05:
+
+	str beta0, [A01]
+	add A01, A01, #4
+
+	subs I, I, #1
+	bne .Lgemm_beta_zero_05
+
+	.align 5
+.Lgemm_beta_zero_06:
+
+	subs N, N, #1
+	bne .Lgemm_beta_zero_02
+
+	.align 5
+.Lgemm_beta_zero_L999:
+	mov x0, #0
+	RESTORE_REGS
+	ret
+
+	EPILOGUE
--- a/kernel/arm64/sgemm_tcopy_16.S
+++ b/kernel/arm64/sgemm_tcopy_16.S
@ -0,0 +1,824 @@
+/***************************************************************************
+Copyright (c) 2019, The OpenBLAS Project
+All rights reserved.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	M		x0
+#define	N		x1
+#define	A		x2
+#define	LDA		x3
+#define	B		x4
+
+#define M8		x5
+
+#define	A01		x6
+#define	A02		x7
+#define	A03		x8
+#define	A04		x9
+#define	A05		x10
+#define	A06		x11
+#define	A07		x12
+#define	A08		x13
+
+#define	B01		x14
+#define	B02		x15
+#define	B03		x16
+#define	B04		x17
+#define	B00		x22
+
+
+#define I		x18
+#define	J		x19
+
+#define TEMP1		x20
+
+#define A_PREFETCH	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+.macro SAVE_REGS
+	add	sp, sp, #-(11 * 16)
+	stp	d8, d9, [sp, #(0 * 16)]
+	stp	d10, d11, [sp, #(1 * 16)]
+	stp	d12, d13, [sp, #(2 * 16)]
+	stp	d14, d15, [sp, #(3 * 16)]
+	stp	d16, d17, [sp, #(4 * 16)]
+	stp	x18, x19, [sp, #(5 * 16)]
+	stp	x20, x21, [sp, #(6 * 16)]
+	stp	x22, x23, [sp, #(7 * 16)]
+	stp	x24, x25, [sp, #(8 * 16)]
+	stp	x26, x27, [sp, #(9 * 16)]
+	str	x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+	ldp	d8, d9, [sp, #(0 * 16)]
+	ldp	d10, d11, [sp, #(1 * 16)]
+	ldp	d12, d13, [sp, #(2 * 16)]
+	ldp	d14, d15, [sp, #(3 * 16)]
+	ldp	d16, d17, [sp, #(4 * 16)]
+	ldp	x18, x19, [sp, #(5 * 16)]
+	ldp	x20, x21, [sp, #(6 * 16)]
+	ldp	x22, x23, [sp, #(7 * 16)]
+	ldp	x24, x25, [sp, #(8 * 16)]
+	ldp	x26, x27, [sp, #(9 * 16)]
+	ldr	x28, [sp, #(10 * 16)]
+	add	sp, sp, #(11*16)
+.endm
+
+/*************************************************************************************************************************/
+
+.macro COPY16x8
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+	//prfm	PSTL1KEEP, [B00, M8]
+	
+	ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
+	add  A01, A01, #64
+	
+	st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
+	add TEMP1, B00, #64
+
+	ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02]
+	add  A02, A02, #64
+	
+	st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03]
+	add  A03, A03, #64
+	
+	st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04]
+	add  A04, A04, #64
+	
+	st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05]
+	add  A05, A05, #64
+	
+	st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06]
+	add  A06, A06, #64
+	
+	st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07]
+	add  A07, A07, #64
+	
+	st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08]
+	add  A08, A08, #64
+	
+	st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+	
+	add	B00, B00, M8
+
+.endm
+
+.macro COPY8x8
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+
+	ldp	q0, q1, [A01]
+	ldp	q2, q3, [A02]
+	add	A01, A01, #32
+	add	A02, A02, #32
+	
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01]
+	add	B01, B01, #64
+	
+	ldp	q4, q5, [A03]
+	ldp	q6, q7, [A04]
+	add	A03, A03, #32
+	add	A04, A04, #32
+
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01]
+	add	B01, B01, #64
+
+	ldp	q8, q9, [A05]
+	ldp	q10, q11, [A06]
+	add	A05, A05, #32
+	add	A06, A06, #32
+
+	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B01]
+	add	B01, B01, #64
+
+	ldp	q12, q13, [A07]
+	ldp	q14, q15, [A08]
+	add	A07, A07, #32
+	add	A08, A08, #32
+
+	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [B01]
+	add	B01, B01, #64
+.endm
+
+.macro COPY4x8
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+
+	ldr	q0, [A01]
+	ldr	q1, [A02]
+	ldr	q2, [A03]
+	ldr	q3, [A04]
+	add	A01, A01, #16
+	add	A02, A02, #16
+	add	A03, A03, #16
+	add	A04, A04, #16
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B02]
+	add	B02, B02, #64
+
+	ldr	q4, [A05]
+	ldr	q5, [A06]
+	ldr	q6, [A07]
+	ldr	q7, [A08]
+
+	add	A05, A05, #16
+	add	A06, A06, #16
+	add	A07, A07, #16
+	add	A08, A08, #16
+
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B02]
+	add	B02, B02, #64
+.endm
+
+.macro COPY2x8
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+
+	ldr	d0, [A01]
+	ldr	d1, [A02]
+	ldr	d2, [A03]
+	ldr	d3, [A04]
+	
+	add	A01, A01, #8
+	add	A02, A02, #8
+	add	A03, A03, #8
+	add	A04, A04, #8
+
+	stp	d0, d1, [B03]
+	add	B03, B03, #16
+	stp	d2, d3, [B03]
+	add	B03, B03, #16
+
+	ldr	d4, [A05]
+	ldr	d5, [A06]
+	ldr	d6, [A07]
+	ldr	d7, [A08]
+	
+	add	A05, A05, #8
+	add	A06, A06, #8
+	add	A07, A07, #8
+	add	A08, A08, #8
+
+	stp	d4, d5, [B03]
+	add	B03, B03, #16
+	stp	d6, d7, [B03]
+	add	B03, B03, #16
+
+.endm
+
+.macro COPY1x8
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+
+	ldr	s0, [A01]		// one float from each of the first four source pointers
+	ldr	s1, [A02]
+	ldr	s2, [A03]
+	ldr	s3, [A04]
+	
+	add	A01, A01, #4		// advance each source pointer by one float
+	add	A02, A02, #4
+	add	A03, A03, #4
+	add	A04, A04, #4
+
+	stp	s0, s1, [B04]		// pack the four floats contiguously at B04
+	add	B04, B04, #8
+	stp	s2, s3, [B04]
+	add	B04, B04, #8
+
+	ldr	s4, [A05]		// one float from each of the last four source pointers
+	ldr	s5, [A06]
+	ldr	s6, [A07]
+	ldr	s7, [A08]
+	
+	add	A05, A05, #4		// advance by one float, as for A01-A04; an 8-byte
+	add	A06, A06, #4		// reload here would read past the single element
+	add	A07, A07, #4		// and step over two floats instead of one
+	add	A08, A08, #4
+
+	stp	s4, s5, [B04]
+	add	B04, B04, #8
+	stp	s6, s7, [B04]
+	add	B04, B04, #8
+
+.endm
+
+/*************************************************************************************************************************/
+.macro COPY16x4
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
+	add	A01, A01, #64
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
+	add	TEMP1, B00, #64
+
+	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02]
+	add	A02, A02, #64
+
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]
+	add	TEMP1, TEMP1, #64
+
+	ld1	{v8.4s, v9.4s, v10.4s, v11.4s}, [A03]
+	add	A03, A03, #64
+
+	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1]
+	add	TEMP1, TEMP1, #64
+
+	ld1	{v12.4s, v13.4s, v14.4s, v15.4s}, [A04]
+	add	A04, A04, #64
+
+	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]
+
+	add	B00, B00, M8
+.endm
+
+.macro COPY8x4
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldp	q0, q1, [A01]
+	ldp	q2, q3, [A02]
+	add	A01, A01, #32
+	add	A02, A02, #32
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01]
+	add	B01, B01, #64
+
+	ldp	q4, q5, [A03]
+	ldp	q6, q7, [A04]
+	add	A03, A03, #32
+	add	A04, A04, #32
+
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01]
+	add	B01, B01, #64
+.endm
+
+.macro COPY4x4
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	q0, [A01]
+	ldr	q1, [A02]
+	ldr	q2, [A03]
+	ldr	q3, [A04]
+	add	A01, A01, #16
+	add	A02, A02, #16
+	add	A03, A03, #16
+	add	A04, A04, #16
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B02]
+
+	add	B02, B02, #64
+.endm
+
+.macro COPY2x4
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	d0, [A01]
+	ldr	d1, [A02]
+	ldr	d2, [A03]
+	ldr	d3, [A04]
+	
+	add	A01, A01, #8
+	add	A02, A02, #8
+	add	A03, A03, #8
+	add	A04, A04, #8
+	
+	stp	d0, d1, [B03]
+	add	B03, B03, #16
+	stp	d2, d3, [B03]
+
+	add	B03, B03, #16
+.endm
+
+.macro COPY1x4
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	s0, [A01]
+	ldr	s1, [A02]
+	ldr	s2, [A03]
+	ldr	s3, [A04]
+	
+	add	A01, A01, #4
+	add	A02, A02, #4
+	add	A03, A03, #4
+	add	A04, A04, #4
+
+	stp	s0, s1, [B04]
+	add	B04, B04, #8
+	stp	s2, s3, [B04]
+	add	B04, B04, #8
+
+.endm
+
+/*************************************************************************************************************************/
+
+.macro COPY16x2
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
+	add	A01, A01, #64
+	
+	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02]
+	add	A02, A02, #64
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
+	add	TEMP1, B00, #64
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]
+	add	B00, B00, M8
+.endm
+
+.macro COPY8x2
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ld1	{v0.4s, v1.4s}, [A01]
+	ld1	{v2.4s, v3.4s}, [A02]
+	add	A01, A01, #32
+	add	A02, A02, #32
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01]
+	add	B01, B01, #64
+.endm
+
+.macro COPY4x2
+	// 4-wide tile over two source lines: 16 bytes from A01 then 16 bytes
+	// from A02 are stored as one 32-byte run at B02.
+	// A01/A02 advance by 16 bytes, B02 by 32 bytes. The prefetch hints
+	// below are left disabled.
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	q0, [A01], #16
+	ldr	q1, [A02], #16
+
+	stp	q0, q1, [B02], #32
+.endm
+
+.macro COPY2x2
+	// 2-wide tile over two source lines: 8 bytes from A01 then 8 bytes
+	// from A02 are stored as one 16-byte run at B03.
+	// A01/A02 advance by 8 bytes, B03 by 16 bytes. The prefetch hints
+	// below are left disabled.
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	d0, [A01], #8
+	ldr	d1, [A02], #8
+
+	stp	d0, d1, [B03], #16
+.endm
+
+.macro COPY1x2
+	// 1-wide tile over two source lines: one float from A01 and one from
+	// A02 are stored next to each other (8 bytes) at B04.
+	// A01/A02 advance by 4 bytes, B04 by 8 bytes. The prefetch hints
+	// below are left disabled.
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	s0, [A01], #4
+	ldr	s1, [A02], #4
+
+	stp	s0, s1, [B04], #8
+.endm
+
+/*************************************************************************************************************************/
+
+.macro COPY16x1
+	// 16-wide tile of a single source line: move 64 bytes from A01 to B00.
+	// A01 advances by 64 bytes; B00 advances by M8.
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00], M8
+.endm
+
+.macro COPY8x1
+	// 8-wide tile of a single source line: move 32 bytes from A01 to B01
+	// and advance both pointers by 32 bytes.
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldp	q0, q1, [A01], #32
+	stp	q0, q1, [B01], #32
+.endm
+
+.macro COPY4x1
+	// 4-wide tile of a single source line: move 16 bytes from A01 to B02
+	// and advance both pointers by 16 bytes. The prefetch hint below is
+	// left disabled.
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldr	q0, [A01], #16
+	str	q0, [B02], #16
+.endm
+
+.macro COPY2x1
+	// 2-wide tile of a single source line: move 8 bytes from A01 to B03
+	// and advance both pointers by 8 bytes. The prefetch hint below is
+	// left disabled.
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldr	d0, [A01], #8
+	str	d0, [B03], #8
+.endm
+
+.macro COPY1x1
+	// Single element: move one float (4 bytes) from A01 to B04 and
+	// advance both pointers by 4 bytes. The prefetch hint below is left
+	// disabled.
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldr	s0, [A01], #4
+	str	s0, [B04], #4
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+// Routine entry: scale the strides to bytes and compute the four
+// "remainder" output cursors B01..B04 before any tile is copied.
+	PROLOGUE
+
+	.align 5
+
+	SAVE_REGS
+
+	lsl	LDA, LDA, #2					// LDA = LDA * SIZE
+
+	lsl	TEMP1, M, #2					// TEMP1 = M * SIZE
+
+	// Round N down to a multiple of 16 / 8 / 4 / 2. Each value becomes
+	// the element offset at which the 8-, 4-, 2- and 1-wide tail tiles
+	// (written by the COPY8x*, COPY4x*, COPY2x*, COPY1x* macros) begin.
+	and	B01 , N , #-16
+	and	B02 , N , #-8
+	and	B03 , N , #-4
+	and	B04 , N , #-2
+
+	// Scale by M * SIZE to turn the rounded counts into byte offsets.
+	mul	B01, B01, TEMP1
+	mul	B02, B02, TEMP1
+	mul	B03, B03, TEMP1
+	mul	B04, B04, TEMP1
+
+	// Make the offsets absolute addresses inside the destination B.
+	add	B01 , B01, B
+	add	B02 , B02, B
+	add	B03 , B03, B
+	add	B04 , B04, B
+
+	// M8 is the amount the COPY16x* macros add to B00 after each 16-wide tile.
+	lsl	M8, M, #6					// M8 = M * 16 * SIZE
+
+// Main loop: process the source eight LDA-strided lines at a time
+// (J = M / 8 iterations). Falls through to the 4/2/1-line remainders.
+// The COPY*x8 macros used here are defined earlier in the file.
+// Note: each "tst N, #bit" / "ble" pair acts as "skip if the bit is clear":
+// ble is taken when Z is set, and a TST against a small positive mask
+// cannot produce a negative result.
+.Lsgemm_tcopy_L8_BEGIN:
+	asr 	J, M, #3					// J = M / 8
+	cmp 	J, #0
+	ble	.Lsgemm_tcopy_L4_BEGIN
+
+	.align	5
+.Lsgemm_tcopy_L8_M16_BEGIN:
+
+	// A01..A08 point at eight consecutive lines of A (LDA bytes apart);
+	// A itself is moved on to the line after A08 for the next iteration.
+	mov	A01, A
+	add	A02, A01, LDA
+	add	A03, A02, LDA
+	add	A04, A03, LDA
+	add	A05, A04, LDA
+	add	A06, A05, LDA
+	add	A07, A06, LDA
+	add	A08, A07, LDA
+	add	A, A08, LDA
+
+	// B00 is the output cursor for the 16-wide tiles of this iteration.
+	mov	B00, B
+	add	B, B00, #512					// B = B + 8 * 16 * SIZE
+
+	asr	I, N, #4					// I = N / 16
+	cmp 	I, #0
+	ble	.Lsgemm_tcopy_L8_M16_40
+
+	.align	5
+.Lsgemm_tcopy_L8_M16_20:
+
+	COPY16x8
+
+	subs	I , I , #1
+	bne	.Lsgemm_tcopy_L8_M16_20
+
+	// Remaining N % 16 elements, handled by testing bits 8, 4, 2, 1 of N.
+.Lsgemm_tcopy_L8_M16_40:
+	tst	N , #8
+	ble	.Lsgemm_tcopy_L8_M16_60
+
+	COPY8x8
+	
+.Lsgemm_tcopy_L8_M16_60:
+	tst	N , #4
+	ble	.Lsgemm_tcopy_L8_M16_80
+
+	COPY4x8
+
+.Lsgemm_tcopy_L8_M16_80:
+
+	tst	N , #2
+	ble	.Lsgemm_tcopy_L8_M16_100
+
+	COPY2x8
+
+.Lsgemm_tcopy_L8_M16_100:
+
+	tst	N, #1
+	ble	.Lsgemm_tcopy_L8_M16_END
+
+	COPY1x8
+
+.Lsgemm_tcopy_L8_M16_END:
+
+	subs	J , J, #1						// j--
+	bne	.Lsgemm_tcopy_L8_M16_BEGIN
+
+/*********************************************************************************************/
+
+// Remainder of M: a block of four source lines, executed at most once
+// (when bit 2 of M is set). If M % 8 == 0 there is nothing left and the
+// routine exits. The COPY16x4/COPY8x4/COPY4x4 macros are defined earlier
+// in the file. Each "tst"/"ble" pair below means "skip if the tested bits
+// are all clear" (ble is taken when Z is set).
+.Lsgemm_tcopy_L4_BEGIN:
+	tst	M, #7
+	ble	.Lsgemm_tcopy_L999
+
+	tst	M, #4
+	ble	.Lsgemm_tcopy_L2_BEGIN
+	
+.Lsgemm_tcopy_L4_M16_BEGIN:
+
+	// A01..A04 point at four consecutive lines of A; A moves past them.
+	mov	A01, A
+	add	A02, A01, LDA
+	add	A03, A02, LDA
+	add	A04, A03, LDA
+	add	A, A04, LDA
+
+	// B00 is the output cursor for the 16-wide tiles of this block.
+	mov	B00, B
+	add	B, B00, #256					// B = B + 4 * 16 * SIZE
+
+	asr	I, N, #4					// I = N / 16
+	cmp 	I, #0
+	ble	.Lsgemm_tcopy_L4_M16_40
+
+	.align	5
+.Lsgemm_tcopy_L4_M16_20:
+
+	COPY16x4
+
+	subs	I , I , #1
+	bne	.Lsgemm_tcopy_L4_M16_20
+
+	// Remaining N % 16 elements, handled by testing bits 8, 4, 2, 1 of N.
+.Lsgemm_tcopy_L4_M16_40:
+	tst	N , #8
+	ble	.Lsgemm_tcopy_L4_M16_60
+
+	COPY8x4
+	
+.Lsgemm_tcopy_L4_M16_60:
+	tst	N , #4
+	ble	.Lsgemm_tcopy_L4_M16_80
+
+	COPY4x4
+
+.Lsgemm_tcopy_L4_M16_80:
+
+	tst	N , #2
+	ble	.Lsgemm_tcopy_L4_M16_100
+
+	COPY2x4
+
+
+.Lsgemm_tcopy_L4_M16_100:
+
+	tst	N, #1
+	ble	.Lsgemm_tcopy_L4_M16_END
+
+	COPY1x4
+
+
+.Lsgemm_tcopy_L4_M16_END:
+
+/*********************************************************************************************/
+
+// Remainder of M: a block of two source lines, executed at most once
+// (when bit 1 of M is set). If M % 4 == 0 the routine exits.
+// Each "tst"/"ble" pair below means "skip if the tested bits are all
+// clear" (ble is taken when Z is set).
+.Lsgemm_tcopy_L2_BEGIN:
+
+	tst	M, #3
+	ble	.Lsgemm_tcopy_L999
+
+	tst	M, #2
+	ble	.Lsgemm_tcopy_L1_BEGIN
+
+.Lsgemm_tcopy_L2_M16_BEGIN:
+	// A01/A02 point at two consecutive lines of A; A moves past them.
+	mov	A01, A
+	add	A02, A01, LDA
+	add	A, A02, LDA
+
+	// B00 is the output cursor for the 16-wide tiles of this block.
+	mov	B00, B
+	add	B, B00, #128					// B = B + 2 * 16 * SIZE
+
+	asr	I, N, #4					// I = N / 16
+	cmp 	I, #0
+	ble	.Lsgemm_tcopy_L2_M16_40
+
+	.align	5
+.Lsgemm_tcopy_L2_M16_20:
+
+	COPY16x2
+
+	subs	I , I , #1
+	bne	.Lsgemm_tcopy_L2_M16_20
+
+	// Remaining N % 16 elements, handled by testing bits 8, 4, 2, 1 of N.
+.Lsgemm_tcopy_L2_M16_40:
+	tst	N , #8
+	ble	.Lsgemm_tcopy_L2_M16_60
+
+	COPY8x2
+
+.Lsgemm_tcopy_L2_M16_60:
+	tst	N , #4
+	ble	.Lsgemm_tcopy_L2_M16_80
+
+	COPY4x2
+
+.Lsgemm_tcopy_L2_M16_80:
+
+	tst	N , #2
+	ble	.Lsgemm_tcopy_L2_M16_100
+
+	COPY2x2
+
+.Lsgemm_tcopy_L2_M16_100:
+
+	tst	N , #1
+	ble	.Lsgemm_tcopy_L2_M16_END
+
+	COPY1x2
+
+.Lsgemm_tcopy_L2_M16_END:
+
+/*********************************************************************************************/
+
+// Remainder of M: the final single source line (when bit 0 of M is set),
+// followed by the common exit path. A and B are not advanced afterwards
+// because nothing follows this block.
+// Each "tst"/"ble" pair below means "skip if the tested bit is clear"
+// (ble is taken when Z is set).
+.Lsgemm_tcopy_L1_BEGIN:
+
+	tst	M, #1
+	ble	.Lsgemm_tcopy_L999
+
+
+.Lsgemm_tcopy_L1_M16_BEGIN:
+
+	mov	A01, A						// A01 = A
+	mov	B00, B
+
+	asr	I, N, #4					// I = N / 16
+	cmp 	I, #0
+	ble	.Lsgemm_tcopy_L1_M16_40
+
+	.align	5
+.Lsgemm_tcopy_L1_M16_20:
+
+	COPY16x1
+
+	subs	I , I , #1
+	bne	.Lsgemm_tcopy_L1_M16_20
+	
+	// Remaining N % 16 elements, handled by testing bits 8, 4, 2, 1 of N.
+.Lsgemm_tcopy_L1_M16_40:
+	tst	N , #8
+	ble	.Lsgemm_tcopy_L1_M16_60
+
+	COPY8x1
+
+.Lsgemm_tcopy_L1_M16_60:
+	tst	N , #4
+	ble	.Lsgemm_tcopy_L1_M16_80
+
+	COPY4x1
+
+.Lsgemm_tcopy_L1_M16_80:
+
+	tst	N , #2
+	ble	.Lsgemm_tcopy_L1_M16_100
+
+	COPY2x1
+
+.Lsgemm_tcopy_L1_M16_100:
+
+	tst	N , #1
+	ble	.Lsgemm_tcopy_L1_M16_END
+
+	COPY1x1
+
+
+.Lsgemm_tcopy_L1_M16_END:
+
+	// Common exit: return 0 in x0 after restoring the saved registers.
+.Lsgemm_tcopy_L999:
+	mov	x0, #0						// set return value
+	RESTORE_REGS
+	ret
+
+	EPILOGUE
+
+
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@ -739,6 +739,26 @@ static void init_parameter(void) {
 }
 #else //POWER

+#if defined(ARCH_ZARCH)
+// ZARCH variant of init_parameter: fills the P, R and Q blocking entries of
+// TABLE_NAME for all four precisions (s, d, c, z) from the corresponding
+// *GEMM_DEFAULT_P/R/Q macros. No runtime detection is performed here.
+// NOTE(review): the *_DEFAULT_* macros are defined outside this hunk; if any
+// of them expands to an expression reading TABLE_NAME, the P -> R -> Q
+// assignment order matters - confirm before reordering.
+static void init_parameter(void) {
+	TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
+	TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
+	TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
+	TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
+
+	TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
+	TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
+	TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
+	TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
+
+
+	TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
+	TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
+	TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
+	TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
+}
+#else //ZARCH
+
 #ifdef ARCH_X86
 static int get_l2_size_old(void){
  int i, eax, ebx, ecx, edx, cpuid_level;
@ -1325,4 +1345,5 @@ static void init_parameter(void) {

 }
+#endif //ZARCH
 #endif //POWER
 #endif //defined(ARCH_ARM64)
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@ -98,5 +98,5 @@ ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c

 CGEMM3MKERNEL    =  cgemm3m_kernel_8x4_haswell.c
-ZGEMM3MKERNEL    =  zgemm3m_kernel_2x8_nehalem.S
+ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_haswell.c

--- a/kernel/x86_64/KERNEL.ZEN
+++ b/kernel/x86_64/KERNEL.ZEN
@ -95,5 +95,5 @@ ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c

 CGEMM3MKERNEL    =  cgemm3m_kernel_8x4_haswell.c
-ZGEMM3MKERNEL    =  zgemm3m_kernel_2x8_nehalem.S
+ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_haswell.c

--- a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
@ -0,0 +1,224 @@
+/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = "+r"(const_val), %7 = "+r"(M), %8 = "+r"(next_b) */
+/* r11 = saved M(const), r12 = k << 5(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */
+
+#include "common.h"
+#include <stdint.h>
+
+//recommended settings: GEMM_Q=256, GEMM_P=256
+
+/* m = 4 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
+/* KERNEL_k1m4n<N>: inline-asm text for one k-step of a 4-row tile against N columns.
+ * Each step advances %0 (A) by 32 bytes. The KERNEL_h_* ("head") forms leave %1 (B)
+ * untouched; the plain KERNEL_k1* forms append the "addq ...,%1" that advances B. */
+#define KERNEL_k1m4n1 \
+    "vmovupd (%0),%%ymm1; addq $32,%0;"\
+    "vbroadcastsd (%1),%%ymm2; vfmadd231pd %%ymm1,%%ymm2,%%ymm4;"\
+    "addq $8,%1;"
+#define KERNEL_h_k1m4n2 \
+    "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\
+    "vbroadcastf128 (%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm4; vfmadd231pd %%ymm2,%%ymm3,%%ymm5;"
+#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
+#define KERNEL_h_k1m4n4 \
+    KERNEL_h_k1m4n2 "vbroadcastf128 16(%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm6; vfmadd231pd %%ymm2,%%ymm3,%%ymm7;"
+#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;"
+/* unit_kernel_k1m4n4: two broadcast+FMA pairs into accumulators c1..c4, reading B at
+ * byte offsets off1/off2 from the address expression given by the trailing arguments
+ * (e.g. "%1" or "%1,%%r12,1", i.e. a further B panel r12 or 2*r12 bytes on). */
+#define unit_kernel_k1m4n4(c1,c2,c3,c4,off1,off2,...) \
+    "vbroadcastf128 "#off1"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\
+    "vbroadcastf128 "#off2"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";"
+#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1)
+#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%1;"
+#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2)
+#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%1;"
+/* KERNEL_k2m4n<N>: two k-steps. For N < 12 this is simply two k1 steps; the N = 12
+ * form is written out by hand, adds a prefetcht0 on A, and advances A and B once by 64. */
+#define KERNEL_k2m4n1 KERNEL_k1m4n1 KERNEL_k1m4n1
+#define KERNEL_k2m4n2 KERNEL_k1m4n2 KERNEL_k1m4n2
+#define KERNEL_k2m4n4 KERNEL_k1m4n4 KERNEL_k1m4n4
+#define KERNEL_k2m4n8 KERNEL_k1m4n8 KERNEL_k1m4n8
+#define KERNEL_k2m4n12 \
+    "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;"\
+    unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,16,%1)\
+    unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1)\
+    unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2)\
+    "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2; prefetcht0 512(%0); addq $64,%0;"\
+    unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,32,48,%1)\
+    unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,32,48,%1,%%r12,1)\
+    unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,32,48,%1,%%r12,2) "addq $64,%1;"
+/* INIT_m4n<N>: zero the accumulators used by the N-column variant. */
+#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define INIT_m4n2 INIT_m4n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m4n4 INIT_m4n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
+#define unit_init_m4n4(c1,c2,c3,c4) \
+    "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
+#define INIT_m4n8  INIT_m4n4 unit_init_m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
+#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
+/* SAVE_h_m4n<N>: duplicate every accumulated value into an adjacent pair of lanes
+ * (unpcklpd/unpckhpd), multiply by ymm0 (alpha) and add into C with vfmadd213pd, then
+ * store back. For N >= 2, %5 walks through C two columns at a time (stride %3 bytes). */
+#define SAVE_h_m4n1 \
+    "vpermpd $216,%%ymm4,%%ymm3; vunpcklpd %%ymm3,%%ymm3,%%ymm1; vunpckhpd %%ymm3,%%ymm3,%%ymm2;"\
+    "vfmadd213pd (%2),%%ymm0,%%ymm1; vfmadd213pd 32(%2),%%ymm0,%%ymm2; vmovupd %%ymm1,(%2); vmovupd %%ymm2,32(%2);"
+#define unit_save_m4n2(c1,c2) \
+    "vperm2f128 $2,"#c1","#c2",%%ymm2; vperm2f128 $19,"#c1","#c2","#c2"; vmovapd %%ymm2,"#c1";"\
+    "vunpcklpd "#c1","#c1",%%ymm2; vunpcklpd "#c2","#c2",%%ymm3;"\
+    "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd 32(%5),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,32(%5);"\
+    "vunpckhpd "#c1","#c1",%%ymm2; vunpckhpd "#c2","#c2",%%ymm3;"\
+    "vfmadd213pd (%5,%3,1),%%ymm0,%%ymm2; vfmadd213pd 32(%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5,%3,1); vmovupd %%ymm3,32(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_h_m4n2 "movq %2,%5;" unit_save_m4n2(%%ymm4,%%ymm5)
+#define SAVE_h_m4n4  SAVE_h_m4n2  unit_save_m4n2(%%ymm6,%%ymm7)
+#define SAVE_h_m4n8  SAVE_h_m4n4  unit_save_m4n2(%%ymm8,%%ymm9)   unit_save_m4n2(%%ymm10,%%ymm11)
+#define SAVE_h_m4n12 SAVE_h_m4n8  unit_save_m4n2(%%ymm12,%%ymm13) unit_save_m4n2(%%ymm14,%%ymm15)
+/* SAVE_m4: store the tile, then advance %2 (C) by 64 bytes. */
+#define SAVE_m4(ndim) SAVE_h_m4n##ndim "addq $64,%2;"
+/* COMPUTE_m4(ndim): full 4-row tile. Zeroes the accumulators, reloads the k counter
+ * (%4 <- r13) and the B head (%1 <- r14), then:
+ *   - label <ndim>004041: unrolled loop, 8 k-steps per pass (four KERNEL_k2), entered
+ *     only when k >= 24 and repeated while at least 16 steps remain; it also issues
+ *     prefetcht1 on C (through %5, stepped using r15) and on next_b (%8);
+ *   - label <ndim>004042: one k-step per pass until %4 reaches zero, with prefetcht0 on C;
+ *   - label <ndim>004043: prefetch the B head and store the tile with SAVE_m4.
+ * The labels are numeric local labels prefixed with ndim so each expansion is unique. */
+#define COMPUTE_m4(ndim) \
+    INIT_m4n##ndim\
+    "movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\
+    "cmpq $24,%4; jb "#ndim"004042f;"\
+    #ndim"004041:\n\t"\
+    "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\
+    KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\
+    "prefetcht1 (%5); subq $63,%5;"\
+    KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\
+    "addq %%r15,%5; prefetcht1 (%8); addq $32,%8;"\
+    "subq $8,%4; cmpq $16,%4; jnb "#ndim"004041b;"\
+    "movq %2,%5;"\
+    #ndim"004042:\n\t"\
+    "testq %4,%4; jz "#ndim"004043f;"\
+    "prefetcht0 (%5); prefetcht0 63(%5);"\
+    KERNEL_k1m4n##ndim\
+    "prefetcht0 (%5,%3,4); prefetcht0 63(%5,%3,4); addq %3,%5;"\
+    "decq %4; jmp "#ndim"004042b;"\
+    #ndim"004043:\n\t"\
+    "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
+    SAVE_m4(ndim)
+
+/* m = 2 *//* vmm0 for alpha, vmm1-vmm3 for temporary use, vmm4-vmm9 for accumulators */
+/* KERNEL_k1m2n<N>: inline-asm text for one k-step of a 2-row tile against N columns.
+ * Each step advances %0 (A) by 16 bytes. The KERNEL_h_* forms do not advance %1 (B);
+ * the plain forms append the "addq ...,%1". For N = 8 and 12 the extra B panels are
+ * read at (%1,%%r12,1) and (%1,%%r12,2). */
+#define KERNEL_k1m2n1 \
+    "vmovupd (%0),%%xmm1; addq $16,%0;"\
+    "vmovddup (%1),%%xmm2; vfmadd231pd %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $8,%1;"
+#define KERNEL_h_k1m2n2 \
+    "vmovddup (%0),%%xmm1; vmovddup 8(%0),%%xmm2; addq $16,%0;"\
+    "vmovupd (%1),%%xmm3; vfmadd231pd %%xmm1,%%xmm3,%%xmm4; vfmadd231pd %%xmm2,%%xmm3,%%xmm5;"
+#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
+#define unit_kernel_k1m2n4(c1,c2,...) \
+    "vmovupd ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"
+#define KERNEL_h_k1m2n4 \
+    "vbroadcastsd (%0),%%ymm1; vbroadcastsd 8(%0),%%ymm2; addq $16,%0;"\
+    unit_kernel_k1m2n4(%%ymm4,%%ymm5,%1)
+#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;"
+#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 \
+    unit_kernel_k1m2n4(%%ymm6,%%ymm7,%1,%%r12,1)
+#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $32,%1;"
+#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n8 \
+    unit_kernel_k1m2n4(%%ymm8,%%ymm9,%1,%%r12,2)
+#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $32,%1;"
+/* INIT_m2n<N>: zero the accumulators used by the N-column variant. */
+#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define unit_init_m2n4(c1,c2) "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";"
+#define INIT_m2n4 unit_init_m2n4(%%ymm4,%%ymm5)
+#define INIT_m2n8 INIT_m2n4 unit_init_m2n4(%%ymm6,%%ymm7)
+#define INIT_m2n12 INIT_m2n8 unit_init_m2n4(%%ymm8,%%ymm9)
+/* SAVE_h_m2n<N>: duplicate each accumulated value into an adjacent pair of lanes,
+ * multiply by ymm0 (alpha) and add into C with vfmadd213pd, then store back.
+ * For N >= 4, %5 walks through C two columns at a time (stride %3 bytes). */
+#define SAVE_h_m2n1 \
+    "vinsertf128 $1,%%xmm4,%%ymm4,%%ymm4; vpermilpd $12,%%ymm4,%%ymm4; vfmadd213pd (%2),%%ymm0,%%ymm4; vmovupd %%ymm4,(%2);"
+#define SAVE_h_m2n2 \
+    "vinsertf128 $1,%%xmm5,%%ymm4,%%ymm4; vunpcklpd %%ymm4,%%ymm4,%%ymm1; vunpckhpd %%ymm4,%%ymm4,%%ymm2;"\
+    "vfmadd213pd (%2),%%ymm0,%%ymm1; vmovupd %%ymm1,(%2);"\
+    "vfmadd213pd (%2,%3,1),%%ymm0,%%ymm2; vmovupd %%ymm2,(%2,%3,1);"
+#define unit_save_m2n4(c1,c2) \
+    "vperm2f128 $2,"#c1","#c2",%%ymm1; vunpcklpd %%ymm1,%%ymm1,%%ymm2; vunpckhpd %%ymm1,%%ymm1,%%ymm3;"\
+    "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd (%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,(%5,%3,1); leaq (%5,%3,2),%5;"\
+    "vperm2f128 $19,"#c1","#c2",%%ymm1; vunpcklpd %%ymm1,%%ymm1,%%ymm2; vunpckhpd %%ymm1,%%ymm1,%%ymm3;"\
+    "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd (%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,(%5,%3,1); leaq (%5,%3,2),%5;"
+#define SAVE_h_m2n4 "movq %2,%5;" unit_save_m2n4(%%ymm4,%%ymm5)
+#define SAVE_h_m2n8 SAVE_h_m2n4 unit_save_m2n4(%%ymm6,%%ymm7)
+#define SAVE_h_m2n12 SAVE_h_m2n8 unit_save_m2n4(%%ymm8,%%ymm9)
+/* SAVE_m2: store the tile, then advance %2 (C) by 32 bytes. */
+#define SAVE_m2(ndim) SAVE_h_m2n##ndim "addq $32,%2;"
+/* COMPUTE_m2(ndim): full 2-row tile. Zeroes the accumulators, reloads the k counter
+ * (%4 <- r13) and the B head (%1 <- r14), runs one k-step per pass until %4 is zero
+ * (label <ndim>002022), then stores the tile (label <ndim>002023). */
+#define COMPUTE_m2(ndim) \
+    INIT_m2n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim"002022:\n\t"\
+    "testq %4,%4; jz "#ndim"002023f;"\
+    KERNEL_k1m2n##ndim\
+    "decq %4; jmp "#ndim"002022b;"\
+    #ndim"002023:\n\t"\
+    SAVE_m2(ndim)
+
+/* m = 1 *//* vmm0 for alpha, vmm1-vmm3 and vmm10-vmm15 for temporary use, vmm4-vmm6 for accumulators */
+/* KERNEL_k1m1n<N>: inline-asm text for one k-step of a single row against N columns.
+ * Each step advances %0 (A) by 8 bytes. The KERNEL_h_* forms do not advance %1 (B);
+ * the plain forms append the "addq ...,%1". For N = 8 and 12 the extra B panels are
+ * read at (%1,%%r12,1) and (%1,%%r12,2). */
+#define KERNEL_k1m1n1 \
+    "vmovsd (%0),%%xmm1; addq $8,%0;"\
+    "vfmadd231sd (%1),%%xmm1,%%xmm4; addq $8,%1;"
+#define KERNEL_k1m1n2 \
+    "vmovddup (%0),%%xmm1; addq $8,%0;"\
+    "vfmadd231pd (%1),%%xmm1,%%xmm4; addq $16,%1;"
+#define unit_kernel_k1m1n4(c1,...) \
+    "vmovupd ("#__VA_ARGS__"),%%ymm2; vfmadd231pd %%ymm1,%%ymm2,"#c1";"
+#define KERNEL_h_k1m1n4 \
+    "vbroadcastsd (%0),%%ymm1; addq $8,%0;"\
+    unit_kernel_k1m1n4(%%ymm4,%1)
+#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;"
+#define KERNEL_h_k1m1n8 KERNEL_h_k1m1n4 unit_kernel_k1m1n4(%%ymm5,%1,%%r12,1)
+#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $32,%1;"
+#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n8 unit_kernel_k1m1n4(%%ymm6,%1,%%r12,2)
+#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $32,%1;"
+/* INIT_m1n<N>: zero the accumulators; n1 and n2 both need only xmm4, so they reuse INIT_m2n1. */
+#define INIT_m1n1 INIT_m2n1
+#define INIT_m1n2 INIT_m2n1
+#define INIT_m1n4 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define INIT_m1n8 INIT_m1n4 "vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m1n12 INIT_m1n8 "vpxor %%ymm6,%%ymm6,%%ymm6;"
+/* SAVE_h_m1n<N>: duplicate each accumulated value into an adjacent pair of lanes,
+ * multiply by alpha (xmm0/ymm0) and add into C with vfmadd213pd, then store back.
+ * For N >= 4 each C column is accessed as a 16-byte piece; one ymm covers the
+ * columns at (%5) and (%5,%3,2), and %5 is stepped by %3 between pairs. */
+#define SAVE_h_m1n1 \
+    "vmovddup %%xmm4,%%xmm4; vfmadd213pd (%2),%%xmm0,%%xmm4; vmovupd %%xmm4,(%2);"
+#define SAVE_h_m1n2 \
+    "vunpcklpd %%xmm4,%%xmm4,%%xmm1; vunpckhpd %%xmm4,%%xmm4,%%xmm2;"\
+    "vfmadd213pd (%2),%%xmm0,%%xmm1; vmovupd %%xmm1,(%2);"\
+    "vfmadd213pd (%2,%3,1),%%xmm0,%%xmm2; vmovupd %%xmm2,(%2,%3,1);"
+#define unit_save_m1n4(c1) \
+    "vunpcklpd "#c1","#c1",%%ymm1; vunpckhpd "#c1","#c1",%%ymm2;"\
+    "vmovupd (%5),%%xmm3; vinsertf128 $1,(%5,%3,2),%%ymm3,%%ymm3;"\
+    "vfmadd213pd %%ymm3,%%ymm0,%%ymm1; vmovupd %%xmm1,(%5); vextractf128 $1,%%ymm1,(%5,%3,2); addq %3,%5;"\
+    "vmovupd (%5),%%xmm3; vinsertf128 $1,(%5,%3,2),%%ymm3,%%ymm3;"\
+    "vfmadd213pd %%ymm3,%%ymm0,%%ymm2; vmovupd %%xmm2,(%5); vextractf128 $1,%%ymm2,(%5,%3,2); addq %3,%5; leaq (%5,%3,2),%5;"
+#define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%ymm4)
+#define SAVE_h_m1n8 SAVE_h_m1n4 unit_save_m1n4(%%ymm5)
+#define SAVE_h_m1n12 SAVE_h_m1n8 unit_save_m1n4(%%ymm6)
+/* SAVE_m1: store the tile, then advance %2 (C) by 16 bytes. */
+#define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $16,%2;"
+/* COMPUTE_m1(ndim): single-row tile. Zeroes the accumulators, reloads the k counter
+ * (%4 <- r13) and the B head (%1 <- r14), runs one k-step per pass until %4 is zero
+ * (label <ndim>001011), then stores the tile (label <ndim>001012). */
+#define COMPUTE_m1(ndim) \
+    INIT_m1n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim"001011:\n\t"\
+    "testq %4,%4; jz "#ndim"001012f;"\
+    KERNEL_k1m1n##ndim\
+    "decq %4; jmp "#ndim"001011b;"\
+    #ndim"001012:\n\t"\
+    SAVE_m1(ndim)
+
+/* COMPUTE(ndim): process one panel of ndim columns of B against all M rows of A.
+ * Expands to a braced block that relies on these locals of the caller: a_pointer,
+ * b_pointer, c_pointer, ldc_in_bytes, K, ctemp, const_val, M, next_b and LDC.
+ * Inside the asm: ymm0 <- broadcast of the two doubles at const_val (alpha);
+ * r13 <- K, r12 <- K << 5, r14 <- b_pointer, r11 <- M (saved). Rows are consumed in
+ * tiles of 4 (COMPUTE_m4, repeated while at least 4 remain), then 2 (COMPUTE_m2),
+ * then 1 (COMPUTE_m1); %4, %1 and %7 are restored before leaving the asm.
+ * Afterwards the C code rewinds a_pointer by M*K, advances b_pointer by ndim*K and
+ * moves c_pointer by 2*(LDC*ndim - M) doubles. */
+#define COMPUTE(ndim) {\
+    next_b = b_pointer + ndim * K;\
+    __asm__ __volatile__(\
+    "vbroadcastf128 (%6),%%ymm0;"\
+    "movq %4,%%r13; movq %4,%%r12; salq $5,%%r12; movq %1,%%r14; movq %7,%%r11;"\
+    "cmpq $4,%7;jb 33101"#ndim"f;"\
+    "33109"#ndim":\n\t"\
+    COMPUTE_m4(ndim)\
+    "subq $4,%7;cmpq $4,%7;jnb 33109"#ndim"b;"\
+    "33101"#ndim":\n\t"\
+    "cmpq $2,%7;jb 33104"#ndim"f;"\
+    COMPUTE_m2(ndim)\
+    "subq $2,%7;"\
+    "33104"#ndim":\n\t"\
+    "testq %7,%7;jz 33105"#ndim"f;"\
+    COMPUTE_m1(ndim)\
+    "33105"#ndim":\n\t"\
+    "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\
+    ::"r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14",\
+    "xmm15","cc","memory");\
+    a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\
+}
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC)
+{
+    /* Kernel entry point: walks the n columns in panels of 12, 8, 4, 2 and 1,
+     * handing each panel to COMPUTE(). Returns 0 in every case, including the
+     * early exit taken when any of m, n, k is zero.
+     * The names a_pointer, b_pointer, c_pointer, ctemp, next_b, ldc_in_bytes,
+     * const_val, M, K and LDC are referenced by the COMPUTE macro and must
+     * keep these exact spellings. */
+    if (!m || !n || !k) return 0;
+
+    /* alpha = (alphar, alphai), laid out contiguously for the asm broadcast */
+    double alpha_pair[2] = { alphar, alphai };
+    double *const_val = alpha_pair;
+
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2;
+    int64_t M = (int64_t)m;
+    int64_t K = (int64_t)k;
+
+    double *a_pointer = A;
+    double *b_pointer = B;
+    double *c_pointer = C;
+    double *ctemp = C;
+    double *next_b = B;
+
+    BLASLONG cols_left = n;
+    while (cols_left >= 12) { COMPUTE(12) cols_left -= 12; }
+    while (cols_left >= 8)  { COMPUTE(8)  cols_left -= 8;  }
+    while (cols_left >= 4)  { COMPUTE(4)  cols_left -= 4;  }
+    while (cols_left >= 2)  { COMPUTE(2)  cols_left -= 2;  }
+    if (cols_left >= 1) COMPUTE(1)
+    return 0;
+}
--- a/kernel/zarch/KERNEL.Z13
+++ b/kernel/zarch/KERNEL.Z13
@ -96,10 +96,10 @@ SGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 SGEMMITCOPY    = ../generic/gemm_tcopy_8.c
 SGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ = sgemm_incopy.o
-SGEMMITCOPYOBJ = sgemm_itcopy.o
-SGEMMONCOPYOBJ = sgemm_oncopy.o
-SGEMMOTCOPYOBJ = sgemm_otcopy.o
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)


 
@ -108,16 +108,16 @@ DGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 DGEMMITCOPY    = ../generic/gemm_tcopy_8.c
 DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ = dgemm_incopy.o
-DGEMMITCOPYOBJ = dgemm_itcopy.o
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

 CGEMMKERNEL    = ctrmm4x4V.S
 CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
-CGEMMONCOPYOBJ =  cgemm_oncopy.o
-CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)

 ZGEMMKERNEL    = ztrmm4x4V.S
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
--- a/kernel/zarch/KERNEL.Z14
+++ b/kernel/zarch/KERNEL.Z14
@ -96,10 +96,10 @@ SGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 SGEMMITCOPY    = ../generic/gemm_tcopy_8.c
 SGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ = sgemm_incopy.o
-SGEMMITCOPYOBJ = sgemm_itcopy.o
-SGEMMONCOPYOBJ = sgemm_oncopy.o
-SGEMMOTCOPYOBJ = sgemm_otcopy.o
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)


 
@ -108,16 +108,16 @@ DGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 DGEMMITCOPY    = ../generic/gemm_tcopy_8.c
 DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ = dgemm_incopy.o
-DGEMMITCOPYOBJ = dgemm_itcopy.o
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

 CGEMMKERNEL    = ctrmm4x4V.S
 CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
-CGEMMONCOPYOBJ =  cgemm_oncopy.o
-CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)

 ZGEMMKERNEL    = ztrmm4x4V.S
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
--- a/kernel/zarch/KERNEL.ZARCH_GENERIC
+++ b/kernel/zarch/KERNEL.ZARCH_GENERIC
@ -94,26 +94,26 @@ ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
 SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
 SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
-SGEMMONCOPYOBJ =  sgemm_oncopy.o
-SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)

 DGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
 DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
 DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

 CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
 CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-CGEMMONCOPYOBJ =  cgemm_oncopy.o
-CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)

 ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-ZGEMMONCOPYOBJ =  zgemm_oncopy.o
-ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)

 STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
--- a/lapack-netlib/.appveyor.yml
+++ b/lapack-netlib/.appveyor.yml
@ -0,0 +1,38 @@
+# AppVeyor CI configuration: Release build with flang (from conda-forge),
+# CMake and the JOM make tool, followed by a ctest run.
+image:
+- Visual Studio 2017
+
+configuration: Release
+clone_depth: 3
+
+matrix:
+  fast_finish: false
+
+skip_commits:
+# Add [av skip] to commit messages
+  message: /\[av skip\]/
+
+# Keep the CMake build directory between runs.
+cache:
+  - '%APPVEYOR_BUILD_FOLDER%\build'
+
+environment:
+  global:
+    CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
+
+# Activate conda, install flang and jom, then set up the MSVC environment
+# and expose the conda library/include directories to the toolchain.
+# NOTE(review): the image is "Visual Studio 2017" but vcvarsall.bat is taken
+# from "Microsoft Visual Studio 14.0" - confirm this pairing is intended.
+install:
+  - call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
+  - conda config --add channels conda-forge --force
+  - conda install --yes --quiet flang jom
+  - call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
+  - set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
+  - set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
+
+# Create the build directory if it is missing, then configure from inside it
+# with testing enabled.
+before_build:
+  - ps: if (-Not (Test-Path .\build)) { mkdir build }
+  - cd build
+  - cmake -G "NMake Makefiles JOM" -DCMAKE_Fortran_COMPILER=flang -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=ON ..
+
+build_script:
+  - cmake --build .
+
+# Run the test suite with two parallel jobs.
+test_script:
+  - ctest -j2
--- a/lapack-netlib/.gitignore
+++ b/lapack-netlib/.gitignore
@ -35,3 +35,9 @@ LAPACKE/example/xexample*
 # SED
 SRC/*-e
 LAPACKE/src/*-e
+build*
+
+# DOCS documentation
+DOCS/man
+DOCS/explore-html
+output_err
--- a/lapack-netlib/.travis.yml
+++ b/lapack-netlib/.travis.yml
@ -1,33 +1,32 @@
-language: cpp
+language: c
+dist: xenial
+group: travis_latest
+
+git:
+  depth: 3
+  quiet: true

 addons:
  apt:
-    sources:
-      - george-edison55-precise-backports # cmake
    packages:
-      - cmake
-      - cmake-data
-      - gfortran
+    - gfortran

-os:
-  - linux
-  - osx
-
-env:
-  - CMAKE_BUILD_TYPE=Release
-  - CMAKE_BUILD_TYPE=Coverage
-
-install:
-  - if [[ "$TRAVIS_OS_NAME" == "osx" ]];
-    then
-      for pkg in gcc cmake; do
-        if brew list -1 | grep -q "^${pkg}\$"; then
-          brew outdated $pkg || brew upgrade $pkg;
-        else
-          brew install $pkg;
-        fi
-      done
-    fi
+matrix:
+  include:
+  - os: linux
+    env: CMAKE_BUILD_TYPE=Release
+  - os: linux
+    env: CMAKE_BUILD_TYPE=Coverage
+  - os: osx
+    env: CMAKE_BUILD_TYPE=Release
+    before_install:
+    - brew update > /dev/null
+    - brew install gcc > /dev/null
+  - os: osx
+    env: CMAKE_BUILD_TYPE=Coverage
+    before_install:
+    - brew update > /dev/null
+    - brew install gcc > /dev/null

 script:
  - export PR=https://api.github.com/repos/$TRAVIS_REPO_SLUG/pulls/$TRAVIS_PULL_REQUEST
--- a/lapack-netlib/BLAS/CMakeLists.txt
+++ b/lapack-netlib/BLAS/CMakeLists.txt
@ -6,4 +6,5 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in ${CMAKE_CURRENT_BINARY_DIR
 install(FILES
  ${CMAKE_CURRENT_BINARY_DIR}/blas.pc
  DESTINATION ${PKG_CONFIG_DIR}
+  COMPONENT Development
  )
--- a/lapack-netlib/BLAS/Makefile
+++ b/lapack-netlib/BLAS/Makefile
@ -1,13 +1,18 @@
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc

+.PHONY: all
 all: blas

+.PHONY: blas
 blas:
 	$(MAKE) -C SRC

+.PHONY: blas_testing
 blas_testing: blas
 	$(MAKE) -C TESTING run

+.PHONY: clean cleanobj cleanlib cleanexe cleantest
 clean:
 	$(MAKE) -C SRC clean
 	$(MAKE) -C TESTING clean
--- a/lapack-netlib/BLAS/SRC/Makefile
+++ b/lapack-netlib/BLAS/SRC/Makefile
@ -1,5 +1,3 @@
-include ../../make.inc
-
 #######################################################################
 #  This is the makefile to create a library for the BLAS.
 #  The files are grouped as follows:
@ -55,6 +53,10 @@ include ../../make.inc
 #
 #######################################################################

+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
+.PHONY: all
 all: $(BLASLIB)

 #---------------------------------------------------------
@ -138,33 +140,32 @@ ALLOBJ = $(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \
 	$(ZBLAS2) $(ZBLAS3) $(ALLBLAS)

 $(BLASLIB): $(ALLOBJ)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@

+.PHONY: single double complex complex16
 single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $^
+	$(AR) $(ARFLAGS) $(BLASLIB) $^
 	$(RANLIB) $(BLASLIB)

 double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $^
+	$(AR) $(ARFLAGS) $(BLASLIB) $^
 	$(RANLIB) $(BLASLIB)

 complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $^
+	$(AR) $(ARFLAGS) $(BLASLIB) $^
 	$(RANLIB) $(BLASLIB)

 complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $^
+	$(AR) $(ARFLAGS) $(BLASLIB) $^
 	$(RANLIB) $(BLASLIB)

 FRC:
 	@FRC=$(FRC)

+.PHONY: clean cleanobj cleanlib
 clean: cleanobj cleanlib
 cleanobj:
 	rm -f *.o
 cleanlib:
 	#rm -f $(BLASLIB)  # May point to a system lib, e.g. -lblas
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
--- a/lapack-netlib/BLAS/SRC/icamax.f
+++ b/lapack-netlib/BLAS/SRC/icamax.f
@ -43,7 +43,7 @@
 *> \param[in] INCX
 *> \verbatim
 *>          INCX is INTEGER
-*>         storage spacing between elements of SX
+*>         storage spacing between elements of CX
 *> \endverbatim
 *
 *  Authors:
--- a/lapack-netlib/BLAS/SRC/idamax.f
+++ b/lapack-netlib/BLAS/SRC/idamax.f
@ -43,7 +43,7 @@
 *> \param[in] INCX
 *> \verbatim
 *>          INCX is INTEGER
-*>         storage spacing between elements of SX
+*>         storage spacing between elements of DX
 *> \endverbatim
 *
 *  Authors:
--- a/lapack-netlib/BLAS/SRC/izamax.f
+++ b/lapack-netlib/BLAS/SRC/izamax.f
@ -43,7 +43,7 @@
 *> \param[in] INCX
 *> \verbatim
 *>          INCX is INTEGER
-*>         storage spacing between elements of SX
+*>         storage spacing between elements of ZX
 *> \endverbatim
 *
 *  Authors:
--- a/lapack-netlib/BLAS/SRC/meson.build
+++ b/lapack-netlib/BLAS/SRC/meson.build
@ -0,0 +1,29 @@
+# Source lists for the reference BLAS. The variable names match the object
+# groups used in BLAS/SRC/Makefile: <S|C|D|Z>BLAS<1|2|3> hold the level 1/2/3
+# routines per precision, CB1AUX/ZB1AUX hold the real-precision level 1
+# routines that the Makefile builds alongside the complex targets, and ALLBLAS
+# holds the routines shared by every precision.
+SBLAS1 = files('isamax.f', 'sasum.f', 'saxpy.f', 'scopy.f', 'sdot.f', 'snrm2.f', 'srot.f', 'srotg.f', 'sscal.f', 'sswap.f', 'sdsdot.f', 'srotmg.f', 'srotm.f')
+
+CBLAS1 = files('scabs1.f', 'scasum.f', 'scnrm2.f', 'icamax.f', 'caxpy.f', 'ccopy.f', 'cdotc.f', 'cdotu.f', 'csscal.f', 'crotg.f', 'cscal.f', 'cswap.f', 'csrot.f')
+
+DBLAS1 = files('idamax.f', 'dasum.f', 'daxpy.f', 'dcopy.f', 'ddot.f', 'dnrm2.f', 'drot.f', 'drotg.f', 'dscal.f', 'dsdot.f', 'dswap.f', 'drotmg.f', 'drotm.f')
+
+ZBLAS1 = files('dcabs1.f', 'dzasum.f', 'dznrm2.f', 'izamax.f', 'zaxpy.f', 'zcopy.f', 'zdotc.f', 'zdotu.f', 'zdscal.f', 'zrotg.f', 'zscal.f', 'zswap.f', 'zdrot.f')
+
+# Real single/double level 1 routines also listed in SBLAS1/DBLAS1 above.
+CB1AUX = files('isamax.f', 'sasum.f', 'saxpy.f', 'scopy.f', 'snrm2.f', 'sscal.f')
+
+ZB1AUX = files('idamax.f', 'dasum.f', 'daxpy.f', 'dcopy.f', 'dnrm2.f', 'dscal.f')
+
+ALLBLAS = files('lsame.f', 'xerbla.f', 'xerbla_array.f')
+
+SBLAS2 = files('sgemv.f', 'sgbmv.f', 'ssymv.f', 'ssbmv.f', 'sspmv.f', 'strmv.f', 'stbmv.f', 'stpmv.f', 'strsv.f', 'stbsv.f', 'stpsv.f', 'sger.f', 'ssyr.f', 'sspr.f', 'ssyr2.f', 'sspr2.f')
+
+CBLAS2 = files('cgemv.f', 'cgbmv.f', 'chemv.f', 'chbmv.f', 'chpmv.f', 'ctrmv.f', 'ctbmv.f', 'ctpmv.f', 'ctrsv.f', 'ctbsv.f', 'ctpsv.f', 'cgerc.f', 'cgeru.f', 'cher.f', 'chpr.f', 'cher2.f', 'chpr2.f')
+
+DBLAS2 = files('dgemv.f', 'dgbmv.f', 'dsymv.f', 'dsbmv.f', 'dspmv.f', 'dtrmv.f', 'dtbmv.f', 'dtpmv.f', 'dtrsv.f', 'dtbsv.f', 'dtpsv.f', 'dger.f', 'dsyr.f', 'dspr.f', 'dsyr2.f', 'dspr2.f')
+
+ZBLAS2 = files('zgemv.f', 'zgbmv.f', 'zhemv.f', 'zhbmv.f', 'zhpmv.f', 'ztrmv.f', 'ztbmv.f', 'ztpmv.f', 'ztrsv.f', 'ztbsv.f', 'ztpsv.f', 'zgerc.f', 'zgeru.f', 'zher.f', 'zhpr.f', 'zher2.f', 'zhpr2.f')
+
+SBLAS3 = files('sgemm.f', 'ssymm.f', 'ssyrk.f', 'ssyr2k.f', 'strmm.f', 'strsm.f')
+
+CBLAS3 = files('cgemm.f', 'csymm.f', 'csyrk.f', 'csyr2k.f', 'ctrmm.f', 'ctrsm.f', 'chemm.f', 'cherk.f', 'cher2k.f')
+
+DBLAS3 = files('dgemm.f', 'dsymm.f', 'dsyrk.f', 'dsyr2k.f', 'dtrmm.f', 'dtrsm.f')
+
+ZBLAS3 = files('zgemm.f', 'zsymm.f', 'zsyrk.f', 'zsyr2k.f', 'ztrmm.f', 'ztrsm.f', 'zhemm.f', 'zherk.f', 'zher2k.f')
--- a/lapack-netlib/BLAS/SRC/sdsdot.f
+++ b/lapack-netlib/BLAS/SRC/sdsdot.f
@ -23,13 +23,13 @@
 *>
 *> \verbatim
 *>
-*    Compute the inner product of two vectors with extended
-*    precision accumulation.
-*
-*    Returns S.P. result with dot product accumulated in D.P.
-*    SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY),
-*    where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is
-*    defined in a similar way using INCY.
+*>   Compute the inner product of two vectors with extended
+*>   precision accumulation.
+*>
+*>   Returns S.P. result with dot product accumulated in D.P.
+*>   SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY),
+*>   where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is
+*>   defined in a similar way using INCY.
 *> \endverbatim
 *
 *  Arguments:
@ -77,7 +77,14 @@
 *> \author Lawson, C. L., (JPL), Hanson, R. J., (SNLA),
 *> \author Kincaid, D. R., (U. of Texas), Krogh, F. T., (JPL)
 *
-*> \ingroup complex_blas_level1
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2017
+*
+*> \ingroup single_blas_level1
 *
 *> \par Further Details:
 *  =====================
@ -102,65 +109,7 @@
 *>    920501  Reformatted the REFERENCES section.  (WRB)
 *>    070118  Reformat to LAPACK coding style
 *> \endverbatim
-*
-*    =====================================================================
-*
-*       .. Local Scalars ..
-*       DOUBLE PRECISION DSDOT
-*       INTEGER I,KX,KY,NS
-*       ..
-*       .. Intrinsic Functions ..
-*       INTRINSIC DBLE
-*       ..
-*       DSDOT = SB
-*       IF (N.LE.0) THEN
-*          SDSDOT = DSDOT
-*          RETURN
-*       END IF
-*       IF (INCX.EQ.INCY .AND. INCX.GT.0) THEN
-*
-*       Code for equal and positive increments.
-*
-*          NS = N*INCX
-*          DO I = 1,NS,INCX
-*             DSDOT = DSDOT + DBLE(SX(I))*DBLE(SY(I))
-*          END DO
-*       ELSE
-*
-*       Code for unequal or nonpositive increments.
-*
-*          KX = 1
-*          KY = 1
-*          IF (INCX.LT.0) KX = 1 + (1-N)*INCX
-*          IF (INCY.LT.0) KY = 1 + (1-N)*INCY
-*          DO I = 1,N
-*             DSDOT = DSDOT + DBLE(SX(KX))*DBLE(SY(KY))
-*             KX = KX + INCX
-*             KY = KY + INCY
-*          END DO
-*       END IF
-*       SDSDOT = DSDOT
-*       RETURN
-*       END
-*
-*> \par Purpose:
-*  =============
 *>
-*> \verbatim
-*> \endverbatim
-*
-*  Authors:
-*  ========
-*
-*> \author Univ. of Tennessee
-*> \author Univ. of California Berkeley
-*> \author Univ. of Colorado Denver
-*> \author NAG Ltd.
-*
-*> \date November 2017
-*
-*> \ingroup single_blas_level1
-*
 *  =====================================================================
      REAL FUNCTION SDSDOT(N,SB,SX,INCX,SY,INCY)
 *
@ -175,71 +124,6 @@
 *     ..
 *     .. Array Arguments ..
      REAL SX(*),SY(*)
-*     ..
-*
-*  PURPOSE
-*  =======
-*
-*  Compute the inner product of two vectors with extended
-*  precision accumulation.
-*
-*  Returns S.P. result with dot product accumulated in D.P.
-*  SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY),
-*  where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is
-*  defined in a similar way using INCY.
-*
-*  AUTHOR
-*  ======
-*  Lawson, C. L., (JPL), Hanson, R. J., (SNLA),
-*  Kincaid, D. R., (U. of Texas), Krogh, F. T., (JPL)
-*
-*  ARGUMENTS
-*  =========
-*
-*  N      (input) INTEGER
-*         number of elements in input vector(s)
-*
-*  SB     (input) REAL
-*         single precision scalar to be added to inner product
-*
-*  SX     (input) REAL array, dimension (N)
-*         single precision vector with N elements
-*
-*  INCX   (input) INTEGER
-*         storage spacing between elements of SX
-*
-*  SY     (input) REAL array, dimension (N)
-*         single precision vector with N elements
-*
-*  INCY   (input) INTEGER
-*         storage spacing between elements of SY
-*
-*  SDSDOT (output) REAL
-*         single precision dot product (SB if N .LE. 0)
-*
-*  Further Details
-*  ===============
-*
-*  REFERENCES
-*
-*  C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T.
-*  Krogh, Basic linear algebra subprograms for Fortran
-*  usage, Algorithm No. 539, Transactions on Mathematical
-*  Software 5, 3 (September 1979), pp. 308-323.
-*
-*  REVISION HISTORY  (YYMMDD)
-*
-*  791001  DATE WRITTEN
-*  890531  Changed all specific intrinsics to generic.  (WRB)
-*  890831  Modified array declarations.  (WRB)
-*  890831  REVISION DATE from Version 3.2
-*  891214  Prologue converted to Version 4.0 format.  (BAB)
-*  920310  Corrected definition of LX in DESCRIPTION.  (WRB)
-*  920501  Reformatted the REFERENCES section.  (WRB)
-*  070118  Reformat to LAPACK coding style
-*
-*  =====================================================================
-*
 *     .. Local Scalars ..
      DOUBLE PRECISION DSDOT
      INTEGER I,KX,KY,NS
--- a/lapack-netlib/BLAS/TESTING/Makefile
+++ b/lapack-netlib/BLAS/TESTING/Makefile
@ -1,5 +1,7 @@
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc

+.PHONY: all single double complex complex16
 all: single double complex complex16
 single:    xblat1s xblat2s xblat3s
 double:    xblat1d xblat2d xblat3d
@ -7,32 +9,33 @@ complex:   xblat1c xblat2c xblat3c
 complex16: xblat1z xblat2z xblat3z

 xblat1s: sblat1.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat1d: dblat1.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat1c: cblat1.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat1z: zblat1.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 xblat2s: sblat2.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat2d: dblat2.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat2c: cblat2.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat2z: zblat2.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 xblat3s: sblat3.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat3d: dblat3.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat3c: cblat3.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat3z: zblat3.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

+.PHONY: run
 run: all
 	./xblat1s > sblat1.out
 	./xblat1d > dblat1.out
@ -47,6 +50,7 @@ run: all
 	./xblat3c < cblat3.in
 	./xblat3z < zblat3.in

+.PHONY: clean cleanobj cleanexe cleantest
 clean: cleanobj cleanexe cleantest
 cleanobj:
 	rm -f *.o
@ -54,6 +58,3 @@ cleanexe:
 	rm -f xblat*
 cleantest:
 	rm -f *.out core
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
--- a/lapack-netlib/BLAS/TESTING/cblat1.f
+++ b/lapack-netlib/BLAS/TESTING/cblat1.f
@ -619,7 +619,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/lapack-netlib/BLAS/TESTING/dblat1.f
+++ b/lapack-netlib/BLAS/TESTING/dblat1.f
@ -991,7 +991,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/lapack-netlib/BLAS/TESTING/sblat1.f
+++ b/lapack-netlib/BLAS/TESTING/sblat1.f
@ -946,7 +946,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/lapack-netlib/BLAS/TESTING/zblat1.f
+++ b/lapack-netlib/BLAS/TESTING/zblat1.f
@ -619,7 +619,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/lapack-netlib/CBLAS/CMakeLists.txt
+++ b/lapack-netlib/CBLAS/CMakeLists.txt
@ -12,8 +12,10 @@ FortranCInterface_HEADER(${LAPACK_BINARY_DIR}/include/cblas_mangling.h
                         SYMBOL_NAMESPACE "F77_")
 if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND)
  message(WARNING "Reverting to pre-defined include/lapacke_mangling.h")
-  configure_file(include/lapacke_mangling_with_flags.h.in
-                 ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h)
+    configure_file(include/lapacke_mangling_with_flags.h.in
+                  ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h)
+    configure_file(include/cblas_mangling_with_flags.h.in
+                 ${LAPACK_BINARY_DIR}/include/cblas_mangling.h)
 endif()

 include_directories(include ${LAPACK_BINARY_DIR}/include)
@ -28,7 +30,10 @@ endforeach()
 endmacro()

 append_subdir_files(CBLAS_INCLUDE "include")
-install(FILES ${CBLAS_INCLUDE} ${LAPACK_BINARY_DIR}/include/cblas_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(FILES ${CBLAS_INCLUDE} ${LAPACK_BINARY_DIR}/include/cblas_mangling.h
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  COMPONENT Development
+  )

 # --------------------------------------------------
 if(BUILD_TESTING)
@ -45,7 +50,9 @@ endif()
 set(_cblas_config_install_guard_target "")
 if(ALL_TARGETS)
  install(EXPORT cblas-targets
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION})
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}
+    COMPONENT Development
+    )
  # Choose one of the cblas targets to use as a guard for
  # cblas-config.cmake to load targets from the install tree.
  list(GET ALL_TARGETS 0 _cblas_config_install_guard_target)
@ -82,4 +89,6 @@ install(FILES
  )

 #install(EXPORT cblas-targets
-#  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION})
+#  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}
+#  COMPONENT Development
+#  )
--- a/lapack-netlib/CBLAS/Makefile
+++ b/lapack-netlib/CBLAS/Makefile
@ -1,19 +1,25 @@
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc

+.PHONY: all
 all: cblas

+.PHONY: cblas
 cblas: include/cblas_mangling.h
 	$(MAKE) -C src

 include/cblas_mangling.h: include/cblas_mangling_with_flags.h.in
-	cp $< $@
+	cp include/cblas_mangling_with_flags.h.in $@

+.PHONY: cblas_testing
 cblas_testing: cblas
 	$(MAKE) -C testing run

+.PHONY: cblas_example
 cblas_example: cblas
 	$(MAKE) -C examples

+.PHONY: clean cleanobj cleanlib cleanexe cleantest
 clean:
 	$(MAKE) -C src clean
 	$(MAKE) -C testing clean
--- a/lapack-netlib/CBLAS/examples/Makefile
+++ b/lapack-netlib/CBLAS/examples/Makefile
@ -1,17 +1,21 @@
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc

+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<
+
+.PHONY: all
 all: cblas_ex1 cblas_ex2

 cblas_ex1: cblas_example1.o $(CBLASLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 cblas_ex2: cblas_example2.o $(CBLASLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

+.PHONY: clean cleanobj cleanexe
 clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
 cleanexe:
 	rm -f cblas_ex1 cblas_ex2
-
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
--- a/lapack-netlib/CBLAS/examples/cblas_example1.c
+++ b/lapack-netlib/CBLAS/examples/cblas_example1.c
@ -47,7 +47,7 @@ int main ( )
   a[m*3+1] = 6;
   a[m*3+2] = 7;
   a[m*3+3] = 8;
-   /* The elemetns of x and y */
+   /* The elements of x and y */
   x[0] = 1;
   x[1] = 2;
   x[2] = 1;
--- a/lapack-netlib/CBLAS/src/Makefile
+++ b/lapack-netlib/CBLAS/src/Makefile
@ -1,7 +1,13 @@
 # This Makefile compiles the CBLAS routines

-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc

+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<
+
+.PHONY: all
 all: $(CBLASLIB)

 # Error handling routines for level 2 & 3
@ -43,24 +49,25 @@ zlev1 = cblas_zswap.o cblas_zscal.o cblas_zdscal.o cblas_zcopy.o \
 # Common files for level 1 single precision
 sclev1 = cblas_scasum.o scasumsub.o cblas_scnrm2.o scnrm2sub.o

+.PHONY: slib1 dlib1 clib1 zlib1
 # Single precision real
 slib1: $(slev1) $(sclev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Double precision real
 dlib1: $(dlev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Single precision complex
 clib1: $(clev1) $(sclev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Double precision complex
 zlib1: $(zlev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 #
@ -95,24 +102,25 @@ zlev2 = cblas_zgemv.o cblas_zgbmv.o cblas_zhemv.o cblas_zhbmv.o cblas_zhpmv.o \
        cblas_ztpsv.o cblas_zgeru.o cblas_zgerc.o cblas_zher.o  cblas_zher2.o \
        cblas_zhpr.o  cblas_zhpr2.o

+.PHONY: slib2 dlib2 clib2 zlib2
 # Single precision real
 slib2: $(slev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Double precision real
 dlib2: $(dlev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Single precision complex
 clib2: $(clev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Double precision complex
 zlib2: $(zlev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 #
@ -141,24 +149,25 @@ zlev3 = cblas_zgemm.o cblas_zsymm.o cblas_zhemm.o cblas_zherk.o \
        cblas_zher2k.o cblas_ztrmm.o cblas_ztrsm.o cblas_zsyrk.o \
        cblas_zsyr2k.o

+.PHONY: slib3 dlib3 clib3 zlib3
 # Single precision real
 slib3: $(slev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Double precision real
 dlib3: $(dlev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Single precision complex
 clib3: $(clev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # Double precision complex
 zlib3: $(zlev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)


@ -166,36 +175,33 @@ alev1 = $(slev1) $(dlev1) $(clev1) $(zlev1) $(sclev1)
 alev2 = $(slev2) $(dlev2) $(clev2) $(zlev2)
 alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3)

+.PHONY: all1 all2 all3
 # All level 1
 all1: $(alev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # All level 2
 all2: $(alev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # All level 3
 all3: $(alev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)

 # All levels and precisions
 $(CBLASLIB): $(alev1) $(alev2) $(alev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@

 FRC:
 	@FRC=$(FRC)

+.PHONY: clean cleanobj cleanlib
 clean: cleanobj cleanlib
 cleanobj:
 	rm -f *.o
 cleanlib:
 	rm -f $(CBLASLIB)
-
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
--- a/lapack-netlib/CBLAS/src/cblas_sgemm.c
+++ b/lapack-netlib/CBLAS/src/cblas_sgemm.c
@ -91,7 +91,7 @@ void cblas_sgemm(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA,
      else
      {
         cblas_xerbla(2, "cblas_sgemm",
-                       "Illegal TransA setting, %d\n", TransA);
+                       "Illegal TransB setting, %d\n", TransB);
         CBLAS_CallFromC = 0;
         RowMajorStrg = 0;
         return;
--- a/lapack-netlib/CBLAS/testing/Makefile
+++ b/lapack-netlib/CBLAS/testing/Makefile
@ -2,7 +2,12 @@
 # The Makefile compiles c wrappers and testers for CBLAS.
 #

-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<

 # Archive files necessary to compile
 LIB = $(CBLASLIB) $(BLASLIB)
@ -27,6 +32,7 @@ ztestl1o = c_zblas1.o
 ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o
 ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o

+.PHONY: all all1 all2 all3
 all: all1 all2 all3
 all1: xscblat1 xdcblat1 xccblat1 xzcblat1
 all2: xscblat2 xdcblat2 xccblat2 xzcblat2
@ -38,37 +44,38 @@ all3: xscblat3 xdcblat3 xccblat3 xzcblat3

 # Single real
 xscblat1: c_sblat1.o $(stestl1o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xscblat2: c_sblat2.o $(stestl2o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xscblat3: c_sblat3.o $(stestl3o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 # Double real
 xdcblat1: c_dblat1.o $(dtestl1o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xdcblat2: c_dblat2.o $(dtestl2o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xdcblat3: c_dblat3.o $(dtestl3o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 # Single complex
 xccblat1: c_cblat1.o $(ctestl1o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xccblat2: c_cblat2.o $(ctestl2o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xccblat3: c_cblat3.o $(ctestl3o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 # Double complex
 xzcblat1: c_zblat1.o $(ztestl1o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xzcblat2: c_zblat2.o $(ztestl2o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xzcblat3: c_zblat3.o $(ztestl3o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^


 # RUN TESTS
+.PHONY: run
 run: all
 	@echo "--> TESTING CBLAS 1 - SINGLE PRECISION REAL <--"
 	@./xscblat1 > stest1.out
@ -95,6 +102,7 @@ run: all
 	@echo "--> TESTING CBLAS 3 - DOUBLE PRECISION COMPLEX <--"
 	@./xzcblat3 < zin3 > ztest3.out

+.PHONY: clean cleanobj cleanexe cleantest
 clean: cleanobj cleanexe cleantest
 cleanobj:
 	rm -f *.o
@ -102,9 +110,3 @@ cleanexe:
 	rm -f x*
 cleantest:
 	rm -f *.out core
-
-.SUFFIXES: .o .f .c
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
--- a/lapack-netlib/CBLAS/testing/c_cblat1.f
+++ b/lapack-netlib/CBLAS/testing/c_cblat1.f
@ -577,7 +577,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/lapack-netlib/CBLAS/testing/c_dblat1.f
+++ b/lapack-netlib/CBLAS/testing/c_dblat1.f
@ -653,7 +653,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/lapack-netlib/CBLAS/testing/c_sblat1.f
+++ b/lapack-netlib/CBLAS/testing/c_sblat1.f
@ -653,7 +653,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/lapack-netlib/CBLAS/testing/c_zblat1.f
+++ b/lapack-netlib/CBLAS/testing/c_zblat1.f
@ -577,7 +577,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake
+++ b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake
@ -1,4 +1,4 @@
-# This module checks against various known compilers and thier respective
+# This module checks against various known compilers and their respective
 # flags to determine any specific flags needing to be set.
 #
 #  1.  If FPE traps are enabled either abort or disable them
--- a/lapack-netlib/CMAKE/FindGcov.cmake
+++ b/lapack-netlib/CMAKE/FindGcov.cmake
@ -20,7 +20,7 @@ set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY})

 get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
 foreach (LANG ${ENABLED_LANGUAGES})
-  # Gcov evaluation is dependend on the used compiler. Check gcov support for
+  # Gcov evaluation is dependent on the used compiler. Check gcov support for
  # each compiler that is used. If gcov binary was already found for this
  # compiler, do not try to find it again.
  if(NOT GCOV_${CMAKE_${LANG}_COMPILER_ID}_BIN)
--- a/lapack-netlib/CMAKE/Findcodecov.cmake
+++ b/lapack-netlib/CMAKE/Findcodecov.cmake
@ -42,7 +42,7 @@ set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY})

 get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
 foreach (LANG ${ENABLED_LANGUAGES})
-  # Coverage flags are not dependend on language, but the used compiler. So
+  # Coverage flags are not dependent on language, but the used compiler. So
  # instead of searching flags foreach language, search flags foreach compiler
  # used.
  set(COMPILER ${CMAKE_${LANG}_COMPILER_ID})
--- a/lapack-netlib/CMAKE/FortranMangling.cmake
+++ b/lapack-netlib/CMAKE/FortranMangling.cmake
@ -24,7 +24,7 @@ message(STATUS "=========")
    set(F77_OUTPUT_EXE "/Fe" CACHE INTERNAL
      "Fortran compiler option for setting executable file name.")
  else()
-    # in other case, let user specify their fortran configrations.
+    # in other case, let user specify their fortran configurations.
    set(F77_OPTION_COMPILE "-c" CACHE STRING
      "Fortran compiler option for compiling without linking.")
    set(F77_OUTPUT_OBJ "-o" CACHE STRING
--- a/lapack-netlib/CMAKE/lapack-config-build.cmake.in
+++ b/lapack-netlib/CMAKE/lapack-config-build.cmake.in
@ -5,6 +5,10 @@ if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}")
 endif()
 unset(_LAPACK_TARGET)

+# Hint for project building against lapack
+set(LAPACK_Fortran_COMPILER_ID "@CMAKE_Fortran_COMPILER_ID@")
+
 # Report the blas and lapack raw or imported libraries.
 set(LAPACK_blas_LIBRARIES "@BLAS_LIBRARIES@")
 set(LAPACK_lapack_LIBRARIES "@LAPACK_LIBRARIES@")
+set(LAPACK_LIBRARIES ${LAPACK_blas_LIBRARIES} ${LAPACK_lapack_LIBRARIES})
--- a/lapack-netlib/CMAKE/lapack-config-install.cmake.in
+++ b/lapack-netlib/CMAKE/lapack-config-install.cmake.in
@ -8,8 +8,12 @@ if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}")
 endif()
 unset(_LAPACK_TARGET)

+# Hint for project building against lapack
+set(LAPACK_Fortran_COMPILER_ID "@CMAKE_Fortran_COMPILER_ID@")
+
 # Report the blas and lapack raw or imported libraries.
 set(LAPACK_blas_LIBRARIES "@BLAS_LIBRARIES@")
 set(LAPACK_lapack_LIBRARIES "@LAPACK_LIBRARIES@")
+set(LAPACK_LIBRARIES ${LAPACK_blas_LIBRARIES} ${LAPACK_lapack_LIBRARIES})

 unset(_LAPACK_SELF_DIR)
--- a/lapack-netlib/CMakeLists.txt
+++ b/lapack-netlib/CMakeLists.txt
@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8.12)
 project(LAPACK Fortran C)

 set(LAPACK_MAJOR_VERSION 3)
-set(LAPACK_MINOR_VERSION 8)
+set(LAPACK_MINOR_VERSION 9)
 set(LAPACK_PATCH_VERSION 0)
 set(
  LAPACK_VERSION
@ -13,6 +13,9 @@ set(
 # Add the CMake directory for custon CMake modules
 set(CMAKE_MODULE_PATH "${LAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH})

+# Export all symbols on Windows when building shared libraries
+SET(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
+
 # Set a default build type if none was specified
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  message(STATUS "Setting build type to 'Release' as none was specified.")
@ -21,8 +24,19 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "Coverage")
 endif()

-string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
-if(${CMAKE_BUILD_TYPE_UPPER} STREQUAL "COVERAGE")
+# Coverage
+set(_is_coverage_build 0)
+set(_msg "Checking if build type is 'Coverage'")
+message(STATUS "${_msg}")
+if(NOT CMAKE_CONFIGURATION_TYPES)
+  string(TOLOWER ${CMAKE_BUILD_TYPE} _build_type_lc)
+  if(${_build_type_lc} STREQUAL "coverage")
+    set(_is_coverage_build 1)
+  endif()
+endif()
+message(STATUS "${_msg}: ${_is_coverage_build}")
+
+if(_is_coverage_build)
  message(STATUS "Adding coverage")
  find_package(codecov)
 endif()
@ -58,18 +72,18 @@ include(PreventInSourceBuilds)
 include(PreventInBuildInstalls)

 if(UNIX)
-  if("${CMAKE_Fortran_COMPILER}" MATCHES "ifort")
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
+  if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") # flags are one space-separated string; list(APPEND) would insert ';'
   endif()
-  if("${CMAKE_Fortran_COMPILER}" MATCHES "xlf")
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none")
+  if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") # same: concatenate as a string, not as a list element
   endif()
 # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
 # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
   string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}")
 endif()

-if(CMAKE_Fortran_COMPILER_ID STREQUAL "Compaq")
+if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq)
  if(WIN32)
    if(CMAKE_GENERATOR STREQUAL "NMake Makefiles")
      get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE)
@ -96,24 +110,16 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL "Compaq")
  endif()
 endif()

-# Get Python
-message(STATUS "Looking for Python greater than 2.6 - ${PYTHONINTERP_FOUND}")
-find_package(PythonInterp 2.7) # lapack_testing.py uses features from python 2.7 and greater
-if(PYTHONINTERP_FOUND)
-  message(STATUS "Using Python version ${PYTHON_VERSION_STRING}")
-else()
-  message(STATUS "No suitable Python version found, so skipping summary tests.")
-endif()
-# --------------------------------------------------

+# --------------------------------------------------
 set(LAPACK_INSTALL_EXPORT_NAME lapack-targets)

 macro(lapack_install_library lib)
  install(TARGETS ${lib}
    EXPORT ${LAPACK_INSTALL_EXPORT_NAME}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT Development
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT RuntimeLibraries
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT RuntimeLibraries
  )
 endmacro()

@ -121,12 +127,22 @@ set(PKG_CONFIG_DIR ${CMAKE_INSTALL_LIBDIR}/pkgconfig)

 # --------------------------------------------------
 # Testing
-option(BUILD_TESTING "Build tests" OFF)
-enable_testing()
+option(BUILD_TESTING "Build tests" ${_is_coverage_build})
 include(CTest)
-enable_testing()
 message(STATUS "Build tests: ${BUILD_TESTING}")

+# lapack_testing.py uses features from python 2.7 and greater
+if(BUILD_TESTING)
+  set(_msg "Looking for Python >= 2.7 needed for summary tests")
+  message(STATUS "${_msg}")
+  find_package(PythonInterp 2.7 QUIET)
+  if(PYTHONINTERP_FOUND)
+    message(STATUS "${_msg} - found (${PYTHON_VERSION_STRING})")
+  else()
+    message(STATUS "${_msg} - not found (skipping summary tests)")
+  endif()
+endif()
+
 # --------------------------------------------------
 # Organize output files.  On Windows this also keeps .dll files next
 # to the .exe files that need them, making tests easy to run.
@ -299,16 +315,40 @@ if(LAPACKE)
  add_subdirectory(LAPACKE)
 endif()

+#-------------------------------------
+# BLAS++ / LAPACK++
+option(BLAS++ "Build BLAS++" OFF)
+option(LAPACK++ "Build LAPACK++" OFF)
+ 
+ 
+function(_display_cpp_implementation_msg name) # Print a STATUS banner announcing ${name}++ and where to download it; builds nothing
+  string(TOLOWER ${name} name_lc) # lower-case form is used in the download URL and package name below
+  message(STATUS "${name}++ enable")
+  message(STATUS "----------------")
+  message(STATUS "Thank you for your interest in ${name}++, a newly developed C++ API for ${name} library")
+  message(STATUS "The objective of ${name}++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc.")
+  message(STATUS "We are still working on integrating ${name}++ in our library. For the moment, you can download directly ${name_lc}++ from https://bitbucket.org/icl/${name_lc}pp")
+  message(STATUS "For support ${name}++ related question, please email: slate-user@icl.utk.edu")
+  message(STATUS "----------------")
+endfunction()
+if(BLAS++)
+  _display_cpp_implementation_msg("BLAS")
+endif()
+if(LAPACK++)
+  _display_cpp_implementation_msg("LAPACK")
+endif()
+
 # --------------------------------------------------
 # CPACK Packaging

 set(CPACK_PACKAGE_NAME "LAPACK")
 set(CPACK_PACKAGE_VENDOR "University of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "LAPACK- Linear Algebra Package")
-set(CPACK_PACKAGE_VERSION_MAJOR 3)
-set(CPACK_PACKAGE_VERSION_MINOR 5)
-set(CPACK_PACKAGE_VERSION_PATCH 0)
+set(CPACK_PACKAGE_VERSION_MAJOR ${LAPACK_MAJOR_VERSION})
+set(CPACK_PACKAGE_VERSION_MINOR ${LAPACK_MINOR_VERSION})
+set(CPACK_PACKAGE_VERSION_PATCH ${LAPACK_PATCH_VERSION})
 set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+set(CPACK_MONOLITHIC_INSTALL ON)
 set(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK")
 if(WIN32 AND NOT UNIX)
  # There is a bug in NSI that does not handle full unix paths properly. Make
@ -347,7 +387,9 @@ endif()
 set(_lapack_config_install_guard_target "")
 if(ALL_TARGETS)
  install(EXPORT lapack-targets
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION})
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION}
+    COMPONENT Development
+    )

  # Choose one of the lapack targets to use as a guard for
  # lapack-config.cmake to load targets from the install tree.
@ -382,6 +424,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_D
  install(FILES
  ${CMAKE_CURRENT_BINARY_DIR}/lapack.pc
  DESTINATION ${PKG_CONFIG_DIR}
+  COMPONENT Development
  )

 configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-install.cmake.in
@ -398,4 +441,6 @@ install(FILES
  ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake
  ${LAPACK_BINARY_DIR}/lapack-config-version.cmake
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION}
+  COMPONENT Development
  )
+  
--- a/lapack-netlib/DOCS/Doxyfile
+++ b/lapack-netlib/DOCS/Doxyfile
@ -38,7 +38,7 @@ PROJECT_NAME           = LAPACK
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = 3.8.0
+PROJECT_NUMBER         = 3.9.0

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
--- a/lapack-netlib/DOCS/Doxyfile_man
+++ b/lapack-netlib/DOCS/Doxyfile_man
@ -38,7 +38,7 @@ PROJECT_NAME           = LAPACK
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = 3.8.0
+PROJECT_NUMBER         = 3.9.0

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
--- a/lapack-netlib/DOCS/lawn81.tex
+++ b/lapack-netlib/DOCS/lawn81.tex
@ -439,39 +439,39 @@ SHELL = /bin/sh
 \end{quote}
 and it will need to be modified to \texttt{SHELL = /sbin/sh} if you are
 installing LAPACK on an SGI architecture.
-Second, you will
-need to modify the \texttt{PLAT} definition, which is appended to all
-library names, to specify the architecture to which you are installing
-LAPACK.  This features avoids confusion in library names when you are
-installing LAPACK on more than one architecture.  Next, you will need
-to modify \texttt{FORTRAN}, \texttt{OPTS}, \texttt{DRVOPTS}, \texttt{NOOPT}, \texttt{LOADER},
-and \texttt{LOADOPTS} to specify
+Next, you will need to modify \texttt{FC}, \texttt{FFLAGS},
+\texttt{FFLAGS\_DRV}, \texttt{FFLAGS\_NOOPT}, and \texttt{LDFLAGS} to specify
 the compiler, compiler options, compiler options for the testing and
-timing\footnotemark[\value{footnote}] main programs, loader, loader options.
-Next you will have to choose which function you will use to time in the \texttt{SECOND} and \texttt{DSECND} routines.
+timing\footnotemark[\value{footnote}] main programs, and linker options.
+Next you will have to choose which function you will use to time in the
+\texttt{SECOND} and \texttt{DSECND} routines.
 \begin{verbatim}
-#The Default : SECOND and DSECND will use a call to the EXTERNAL FUNCTION ETIME
-TIMER    = EXT_ETIME
-# For RS6K : SECOND and DSECND will use a call to the EXTERNAL FUNCTION ETIME_
-# TIMER    = EXT_ETIME_
-# For gfortran compiler: SECOND and DSECND will use the INTERNAL FUNCTION ETIME
-# TIMER    = INT_ETIME
-# If your Fortran compiler does not provide etime (like Nag Fortran Compiler, etc...)
-# SECOND and DSECND will use a call to the INTERNAL FUNCTION CPU_TIME
-# TIMER    = INT_CPU_TIME
-# If neither of this works...you can use the NONE value...
-# In that case, SECOND and DSECND will always return 0
-# TIMER     = NONE
+#  Default:  SECOND and DSECND will use a call to the
+#  EXTERNAL FUNCTION ETIME
+#TIMER = EXT_ETIME
+#  For RS6K:  SECOND and DSECND will use a call to the
+#  EXTERNAL FUNCTION ETIME_
+#TIMER = EXT_ETIME_
+#  For gfortran compiler:  SECOND and DSECND will use a call to the
+#  INTERNAL FUNCTION ETIME
+TIMER = INT_ETIME
+#  If your Fortran compiler does not provide etime (like Nag Fortran
+#  Compiler, etc...) SECOND and DSECND will use a call to the
+#  INTERNAL FUNCTION CPU_TIME
+#TIMER = INT_CPU_TIME
+#  If none of these work, you can use the NONE value.
+#  In that case, SECOND and DSECND will always return 0.
+#TIMER = NONE
 \end{verbatim}
 Refer to the section~\ref{second} to get more information.


-Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver,
+Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver,
 archiver options, and ranlib for your machine.  If your architecture
 does not require \texttt{ranlib} to be run after each archive command (as
 is the case with CRAY computers running UNICOS, Hewlett Packard
 computers running HP-UX, or SUN SPARCstations running Solaris), set
-\texttt{ranlib=echo}.  And finally, you must
+\texttt{RANLIB = echo}.  And finally, you must
 modify the \texttt{BLASLIB} definition to specify the BLAS library to which
 you will be linking.  If an optimized version of the BLAS is available
 on your machine, you are highly recommended to link to that library.
@ -721,24 +721,24 @@ The version that will be used depends on the value of the TIMER variable in the

 \begin{itemize}
 \item If ETIME is available as an external function, set the value of the TIMER variable in your
-make.inc to \texttt{EXT\_ETIME}:\texttt{second\_EXT\_ETIME.f} and \texttt{dsecnd\_EXT\_ETIME.f} will be used.
+make.inc to \texttt{EXT\_ETIME}: \texttt{second\_EXT\_ETIME.f} and \texttt{dsecnd\_EXT\_ETIME.f} will be used.
 Usually on HPPA architectures,
-the compiler and loader flag \texttt{+U77} should be included to access
+the compiler and linker flag \texttt{+U77} should be included to access
 the function \texttt{ETIME}.

 \item If ETIME\_ is available as an external function, set the value of the TIMER variable in your make.inc
-to \texttt{EXT\_ETIME\_}:\texttt{second\_EXT\_ETIME\_.f} and \texttt{dsecnd\_EXT\_ETIME\_.f} will be used.
+to \texttt{EXT\_ETIME\_}: \texttt{second\_EXT\_ETIME\_.f} and \texttt{dsecnd\_EXT\_ETIME\_.f} will be used.
 It is the case on some IBM architectures such as IBM RS/6000s.

 \item If ETIME is available as an internal function, set the value of the TIMER variable in your make.inc
-to \texttt{INT\_ETIME}:\texttt{second\_INT\_ETIME.f}  and \texttt{dsecnd\_INT\_ETIME.f} will be used.
+to \texttt{INT\_ETIME}: \texttt{second\_INT\_ETIME.f}  and \texttt{dsecnd\_INT\_ETIME.f} will be used.
 This is the case with gfortran.

 \item If CPU\_TIME is available as an internal function, set the value of the TIMER variable in your make.inc
-to \texttt{INT\_CPU\_TIME}:\texttt{second\_INT\_CPU\_TIME.f} and \texttt{dsecnd\_INT\_CPU\_TIME.f} will be used.
+to \texttt{INT\_CPU\_TIME}: \texttt{second\_INT\_CPU\_TIME.f} and \texttt{dsecnd\_INT\_CPU\_TIME.f} will be used.

 \item If none of these functions is available, set the value of the TIMER variable in your make.inc
-to \texttt{NONE:}\texttt{second\_NONE.f} and \texttt{dsecnd\_NONE.f} will be used.
+to \texttt{NONE}: \texttt{second\_NONE.f} and \texttt{dsecnd\_NONE.f} will be used.
 These routines will always return zero.
 \end{itemize}

@ -829,8 +829,8 @@ data type to the library if necessary.
 \end{itemize}

 \noindent
-The BLAS library is created in \texttt{LAPACK/blas\_PLAT.a}, where
-\texttt{PLAT} is the user-defined architecture suffix specified in the file
+The BLAS library is created in \texttt{LAPACK/librefblas.a},
+or in the user-defined location specified by \texttt{BLASLIB} in the file
 \texttt{LAPACK/make.inc}.

 \subsection{Run the BLAS Test Programs}\label{testblas}
@ -882,8 +882,8 @@ data type to the library if necessary.
 \end{itemize}

 \noindent
-The LAPACK library is created in \texttt{LAPACK/lapack\_PLAT.a}, where
-\texttt{PLAT} is the user-defined architecture suffix specified in the file
+The LAPACK library is created in \texttt{LAPACK/liblapack.a},
+or in the user-defined location specified by \texttt{LAPACKLIB} in the file
 \texttt{LAPACK/make.inc}.

 \subsection{Create the Test Matrix Generator Library}
@ -902,9 +902,9 @@ data type to the library if necessary.
 \end{itemize}

 \noindent
-The test matrix generator library is created in \texttt{LAPACK/tmglib\_PLAT.a},
-where \texttt{PLAT} is the user-defined architecture suffix specified in the
-file \texttt{LAPACK/make.inc}.
+The test matrix generator library is created in \texttt{LAPACK/libtmglib.a},
+or in the user-defined location specified by \texttt{TMGLIB} in the file
+\texttt{LAPACK/make.inc}.

 \subsection{Run the LAPACK Test Programs}

@ -1114,9 +1114,7 @@ To make a library of the instrumented LAPACK routines, first
 go to \texttt{LAPACK/TIMING/LIN/LINSRC} and type \texttt{make} followed
 by the data types desired, as in the examples of Section~\ref{toplevelmakefile}.
 The library of instrumented code is created in
-\texttt{LAPACK/TIMING/LIN/linsrc\_PLAT.a},
-where \texttt{PLAT} is the user-defined architecture suffix specified in the
-file \texttt{LAPACK/make.inc}.
+\texttt{LAPACK/TIMING/LIN/linsrc.a}.
 \end{sloppypar}

 \item[b)]
@ -1251,9 +1249,7 @@ To make a library of the instrumented LAPACK routines, first
 go to \texttt{LAPACK/TIMING/EIG/EIGSRC} and type \texttt{make} followed
 by the data types desired, as in the examples of Section~\ref{toplevelmakefile}.
 The library of instrumented code is created in
-\texttt{LAPACK/TIMING/EIG/eigsrc\_PLAT.a},
-where \texttt{PLAT} is the user-defined architecture suffix specified in the
-file \texttt{LAPACK/make.inc}.
+\texttt{LAPACK/TIMING/EIG/eigsrc.a}.
 \end{sloppypar}

 \item[b)]
@ -1389,7 +1385,7 @@ installing LAPACK on an SGI architecture.
 \section{ETIME}

 On HPPA architectures,
-the compiler and loader flag \texttt{+U77} should be included to access
+the compiler and linker flag \texttt{+U77} should be included to access
 the function \texttt{ETIME}.

 \section{ILAENV and IEEE-754 compliance}
@ -1494,13 +1490,13 @@ has two options:  increase your stack size, or force all local variables
 to be allocated statically.

 On HPPA architectures, the
-compiler and loader flag \texttt{-K} should be used when compiling these testing
+compiler and linker flag \texttt{-K} should be used when compiling these testing
 and timing main programs to avoid such a stack overflow.  I.e., set
-\texttt{DRVOPTS = -K} in the \texttt{LAPACK/make.inc} file.
+\texttt{FFLAGS\_DRV = -K} in the \texttt{LAPACK/make.inc} file.

 For similar reasons,
-on SGI architectures, the compiler and loader flag \texttt{-static} should be
-used.  I.e., set \texttt{DRVOPTS = -static} in the \texttt{LAPACK/make.inc} file.
+on SGI architectures, the compiler and linker flag \texttt{-static} should be
+used.  I.e., set \texttt{FFLAGS\_DRV = -static} in the \texttt{LAPACK/make.inc} file.

 \section{IEEE arithmetic}

--- a/lapack-netlib/INSTALL/Makefile
+++ b/lapack-netlib/INSTALL/Makefile
@ -1,30 +1,33 @@
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc

+.PHONY: all testlsame testslamch testdlamch testsecond testdsecnd testieee testversion
 all: testlsame testslamch testdlamch testsecond testdsecnd testieee testversion

 testlsame: lsame.o lsametst.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 testslamch: slamch.o lsame.o slamchtst.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 testdlamch: dlamch.o lsame.o dlamchtst.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 testsecond: second_$(TIMER).o secondtst.o
 	@echo "[INFO] : TIMER value: $(TIMER) (given by make.inc)"
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 testdsecnd: dsecnd_$(TIMER).o dsecndtst.o
 	@echo "[INFO] : TIMER value: $(TIMER) (given by make.inc)"
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 testieee: tstiee.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

 testversion: ilaver.o LAPACK_version.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^

+.PHONY: run
 run: all
 	./testlsame
 	./testslamch
@ -34,6 +37,7 @@ run: all
 	./testieee
 	./testversion

+.PHONY: clean cleanobj cleanexe cleantest
 clean: cleanobj cleanexe cleantest
 cleanobj:
 	rm -f *.o
@ -42,9 +46,5 @@ cleanexe:
 cleantest:
 	rm -f core

-.SUFFIXES: .o .f
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
-
-slamch.o: slamch.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
-dlamch.o: dlamch.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
+slamch.o: slamch.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
+dlamch.o: dlamch.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
--- a/lapack-netlib/INSTALL/dlamch.f
+++ b/lapack-netlib/INSTALL/dlamch.f
@ -10,6 +10,10 @@
 *
 *      DOUBLE PRECISION FUNCTION DLAMCH( CMACH )
 *
+*     .. Scalar Arguments ..
+*     CHARACTER          CMACH
+*     ..
+*
 *
 *> \par Purpose:
 *  =============
@ -24,6 +28,7 @@
 *
 *> \param[in] CMACH
 *> \verbatim
+*>          CMACH is CHARACTER*1
 *>          Specifies the value to be returned by DLAMCH:
 *>          = 'E' or 'e',   DLAMCH := eps
 *>          = 'S' or 's',   DLAMCH := sfmin
--- a/lapack-netlib/INSTALL/dlamchf77.f
+++ b/lapack-netlib/INSTALL/dlamchf77.f
@ -10,6 +10,10 @@
 *
 *      DOUBLE PRECISION FUNCTION DLAMCH( CMACH )
 *
+*     .. Scalar Arguments ..
+*     CHARACTER          CMACH
+*     ..
+*
 *
 *> \par Purpose:
 *  =============
--- a/lapack-netlib/INSTALL/ilaver.f
+++ b/lapack-netlib/INSTALL/ilaver.f
@ -25,12 +25,15 @@
 *  ==========
 *
 *>  \param[out] VERS_MAJOR
+*>      VERS_MAJOR is INTEGER
 *>      return the lapack major version
 *>
 *>  \param[out] VERS_MINOR
+*>      VERS_MINOR is INTEGER
 *>      return the lapack minor version from the major version
 *>
 *>  \param[out] VERS_PATCH
+*>      VERS_PATCH is INTEGER
 *>      return the lapack patch version from the minor version
 *
 *  Authors:
@ -41,24 +44,23 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date June 2017
+*> \date November 2019
 *
 *> \ingroup auxOTHERauxiliary
 *
 *  =====================================================================
      SUBROUTINE ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH )
 *
-*  -- LAPACK computational routine (version 3.7.1) --
+*  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     June 2017
 *
 *  =====================================================================
 *
      INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH
 *  =====================================================================
      VERS_MAJOR = 3
-      VERS_MINOR = 8
+      VERS_MINOR = 9
      VERS_PATCH = 0
 *  =====================================================================
 *
--- a/lapack-netlib/INSTALL/make.inc.ALPHA
+++ b/lapack-netlib/INSTALL/make.inc.ALPHA
@ -8,30 +8,28 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O4

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -O4 -fpe1
-DRVOPTS = $(OPTS)
-NOOPT   =
+FC = f77
+FFLAGS = -O4 -fpe1
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT =

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS =
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib

 #  Timer for the SECOND and DSECND routines
 #
@ -74,9 +72,9 @@ TIMER = EXT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-#BLASLIB      = ../../librefblas.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
 BLASLIB      = -ldxml
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.HPPA
+++ b/lapack-netlib/INSTALL/make.inc.HPPA
@ -8,30 +8,28 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS =

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = +O4 +U77
-DRVOPTS = $(OPTS) -K
-NOOPT   = +U77
+FC = f77
+FFLAGS = +O4 +U77
+FFLAGS_DRV = $(FFLAGS) -K
+FFLAGS_NOOPT = +U77

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS = -Aa +U77
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo

 #  Timer for the SECOND and DSECND routines
 #
@ -74,9 +72,9 @@ TIMER = EXT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-#BLASLIB      = ../../librefblas.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
 BLASLIB      = -lblas
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.IRIX64
+++ b/lapack-netlib/INSTALL/make.inc.IRIX64
@ -8,33 +8,30 @@ SHELL = /sbin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O3

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
-#OPTS    = -g  -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
-DRVOPTS = $(OPTS) -static
-NOOPT   = -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
-#NOOPT   = -g  -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
+FC = f77
+FFLAGS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
+#FFLAGS = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
+FFLAGS_DRV = $(FFLAGS) -static
+FFLAGS_NOOPT = -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
+#FFLAGS_NOOPT = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
-#LOADOPTS = -g  -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo

 #  Timer for the SECOND and DSECND routines
 #
@ -78,8 +75,8 @@ TIMER = EXT_ETIME
 #  possible.)
 #
 #BLASLIB      = -lblas
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.O2K
+++ b/lapack-netlib/INSTALL/make.inc.O2K
@ -8,33 +8,30 @@ SHELL = /sbin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O3

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -O3 -64 -mips4 -r10000
-#OPTS    = -O3 -64 -mips4 -r10000 -mp
-DRVOPTS = $(OPTS) -static
-NOOPT   = -64 -mips4 -r10000
-#NOOPT   = -64 -mips4 -r10000 -mp
+FC = f77
+FFLAGS = -O3 -64 -mips4 -r10000
+#FFLAGS = -O3 -64 -mips4 -r10000 -mp
+FFLAGS_DRV = $(FFLAGS) -static
+FFLAGS_NOOPT = -64 -mips4 -r10000
+#FFLAGS_NOOPT = -64 -mips4 -r10000 -mp

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS = -O3 -64 -mips4 -r10000
-#LOADOPTS = -O3 -64 -mips4 -r10000 -mp
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo

 #  Timer for the SECOND and DSECND routines
 #
@ -79,8 +76,8 @@ TIMER = EXT_ETIME
 #
 BLASLIB      = -lblas
 #BLASLIB      = -lblas_mp
-#BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.SGI5
+++ b/lapack-netlib/INSTALL/make.inc.SGI5
@ -8,30 +8,28 @@ SHELL = /sbin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O4

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -O4
-DRVOPTS = $(OPTS) -static
-NOOPT   =
+FC = f77
+FFLAGS = -O4
+FFLAGS_DRV = $(FFLAGS) -static
+FFLAGS_NOOPT =

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS =
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo

 #  Timer for the SECOND and DSECND routines
 #
@ -75,8 +73,8 @@ TIMER = EXT_ETIME
 #  possible.)
 #
 #BLASLIB      = -lblas
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.SUN4
+++ b/lapack-netlib/INSTALL/make.inc.SUN4
@ -8,30 +8,28 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O3

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -dalign -O4 -fast
-DRVOPTS = $(OPTS)
-NOOPT   =
+FC = f77
+FFLAGS = -dalign -O4 -fast
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT =

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS = -dalign -O4 -fast
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib

 #  Timer for the SECOND and DSECND routines
 #
@ -75,8 +73,8 @@ TIMER = EXT_ETIME
 #  possible.)
 #
 #BLASLIB      = -lblas
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2
+++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2
@ -8,34 +8,31 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O3

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-#OPTS    = -O4 -u -f -mt
-#OPTS    = -u -f -dalign -native -xO5 -xarch=v8plusa
-OPTS    = -u -f -dalign -native -xO2 -xarch=v8plusa
-DRVOPTS = $(OPTS)
-NOOPT   = -u -f
-#NOOPT   = -u -f -mt
+FC = f77
+#FFLAGS = -O4 -u -f -mt
+#FFLAGS = -u -f -dalign -native -xO5 -xarch=v8plusa
+FFLAGS = -u -f -dalign -native -xO2 -xarch=v8plusa
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -u -f
+#FFLAGS_NOOPT = -u -f -mt

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-#LOADOPTS = -mt
-LOADOPTS = -f -dalign -native -xO2 -xarch=v8plusa
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo

 #  Timer for the SECOND and DSECND routines
 #
@ -78,10 +75,10 @@ TIMER = EXT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-#BLASLIB      = ../../librefblas.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
 #BLASLIB      = -xlic_lib=sunperf_mt
 BLASLIB      = -xlic_lib=sunperf
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.XLF
+++ b/lapack-netlib/INSTALL/make.inc.XLF
@ -8,31 +8,29 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = xlc
+CC = xlc
 CFLAGS = -O3 -qnosave

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = xlf
-OPTS    = -O3 -qfixed -qnosave
+FC = xlf
+FFLAGS = -O3 -qfixed -qnosave
 # For -O2, add -qstrict=none
-DRVOPTS = $(OPTS)
-NOOPT   = -O0 -qfixed -qnosave
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0 -qfixed -qnosave

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = xlf
-LOADOPTS = -qnosave
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib

 #  Timer for the SECOND and DSECND routines
 #
@ -75,9 +73,9 @@ TIMER = EXT_ETIME_
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-#BLASLIB      = ../../librefblas.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
 BLASLIB      = -lessl
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.gfortran
+++ b/lapack-netlib/INSTALL/make.inc.gfortran
@ -8,10 +8,10 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = gcc
+CC = gcc
 CFLAGS = -O3

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
@ -19,23 +19,21 @@ CFLAGS = -O3
 #  and handle these quantities appropriately. As a consequence, one
 #  should not compile LAPACK with flags such as -ffpe-trap=overflow.
 #
-FORTRAN = gfortran
-OPTS    = -O2 -frecursive
-DRVOPTS = $(OPTS)
-NOOPT   = -O0 -frecursive
+FC = gfortran
+FFLAGS = -O2 -frecursive
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0 -frecursive

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = gfortran
-LOADOPTS =
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib

 #  Timer for the SECOND and DSECND routines
 #
@ -78,8 +76,8 @@ TIMER = INT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.gfortran_debug
+++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug
@ -8,10 +8,10 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = gcc
+CC = gcc
 CFLAGS = -g

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  FFLAGS_NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
@ -19,23 +19,21 @@ CFLAGS = -g
 #  and handle these quantities appropriately. As a consequence, one
 #  should not compile LAPACK with flags such as -ffpe-trap=overflow.
 #
-FORTRAN = gfortran -fimplicit-none -g -frecursive
-OPTS    =
-DRVOPTS = $(OPTS)
-NOOPT   = -g -O0 -frecursive
+FC = gfortran
+FFLAGS = -fimplicit-none -g -frecursive
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = $(FFLAGS) -O0

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = gfortran -g
-LOADOPTS =
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib

 #  Timer for the SECOND and DSECND routines
 #
@ -78,8 +76,8 @@ TIMER = INT_CPU_TIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.ifort
+++ b/lapack-netlib/INSTALL/make.inc.ifort
@ -8,30 +8,28 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = icc
+CC = icc
 CFLAGS = -O3

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to refer to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = ifort
-OPTS    = -O3 -fp-model strict -assume protect_parens
-DRVOPTS = $(OPTS)
-NOOPT   = -O0 -fp-model strict -assume protect_parens
+FC = ifort
+FFLAGS = -O3 -fp-model strict -assume protect_parens
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0 -fp-model strict -assume protect_parens

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Set LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = ifort
-LOADOPTS =
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib

 #  Timer for the SECOND and DSECND routines
 #
@ -74,8 +72,8 @@ TIMER = EXT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.pgf95
+++ b/lapack-netlib/INSTALL/make.inc.pgf95
@ -8,30 +8,28 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = pgcc
+CC = pgcc
 CFLAGS =

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to refer to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = pgf95
-OPTS    = -O3
-DRVOPTS = $(OPTS)
-NOOPT   = -O0
+FC = pgf95
+FFLAGS = -O3
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Set LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = $(FORTRAN)
-LOADOPTS =
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo

 #  Timer for the SECOND and DSECND routines
 #
@ -74,8 +72,8 @@ TIMER = INT_CPU_TIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/make.inc.pghpf
+++ b/lapack-netlib/INSTALL/make.inc.pghpf
@ -8,30 +8,28 @@ SHELL = /bin/sh

 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = pghpc
+CC = pghpc
 CFLAGS =

-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to refer to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = pghpf
-OPTS    = -O4 -Mnohpfc -Mdclchk
-DRVOPTS = $(OPTS)
-NOOPT   = -Mnohpfc -Mdclchk
+FC = pghpf
+FFLAGS = -O4 -Mnohpfc -Mdclchk
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -Mnohpfc -Mdclchk

-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Set LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = pghpf
-LOADOPTS =
+LDFLAGS =

 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo

 #  Timer for the SECOND and DSECND routines
 #
@ -75,8 +73,8 @@ TIMER = EXT_ETIME
 #  possible.)
 #
 #BLASLIB      = -lessl
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
--- a/lapack-netlib/INSTALL/slamch.f
+++ b/lapack-netlib/INSTALL/slamch.f
@ -28,6 +28,7 @@
 *
 *> \param[in] CMACH
 *> \verbatim
+*>          CMACH is CHARACTER*1
 *>          Specifies the value to be returned by SLAMCH:
 *>          = 'E' or 'e',   SLAMCH := eps
 *>          = 'S' or 's ,   SLAMCH := sfmin
--- a/lapack-netlib/LAPACKE/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/CMakeLists.txt
@ -16,18 +16,16 @@ if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND)
                 ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h)
 endif()

-if(WIN32 AND NOT UNIX)
-  add_definitions(-DHAVE_LAPACK_CONFIG_H -DLAPACK_COMPLEX_STRUCTURE)
-  message(STATUS "Windows BUILD")
-endif()
-
-get_directory_property(DirDefs COMPILE_DEFINITIONS)
-
 include_directories(include ${LAPACK_BINARY_DIR}/include)
 add_subdirectory(include)
 add_subdirectory(src)
 add_subdirectory(utils)

+option(LAPACKE_BUILD_SINGLE "Build LAPACKE single precision real" ON)
+option(LAPACKE_BUILD_DOUBLE "Build LAPACKE double precision real" ON)
+option(LAPACKE_BUILD_COMPLEX "Build LAPACKE single precision complex" ON)
+option(LAPACKE_BUILD_COMPLEX16 "Build LAPACKE double precision complex" ON)
+
 macro(append_subdir_files variable dirname)
  get_directory_property(holder DIRECTORY ${dirname} DEFINITION ${variable})
  foreach(depfile ${holder})
@ -35,8 +33,29 @@ macro(append_subdir_files variable dirname)
  endforeach()
 endmacro()

+message(STATUS "Build LAPACKE single precision real: ${LAPACKE_BUILD_SINGLE}")
+message(STATUS "Build LAPACKE double precision real: ${LAPACKE_BUILD_DOUBLE}")
+message(STATUS "Build LAPACKE single precision complex: ${LAPACKE_BUILD_COMPLEX}")
+message(STATUS "Build LAPACKE double precision complex: ${LAPACKE_BUILD_COMPLEX16}")
+
 append_subdir_files(LAPACKE_INCLUDE "include")
 append_subdir_files(SOURCES "src")
+if (LAPACKE_BUILD_SINGLE)
+  append_subdir_files(SOURCES_SINGLE "src")
+  list(APPEND SOURCES ${SOURCES_SINGLE})
+endif()
+if (LAPACKE_BUILD_DOUBLE)
+  append_subdir_files(SOURCES_DOUBLE "src")
+  list(APPEND SOURCES ${SOURCES_DOUBLE})
+endif()
+if (LAPACKE_BUILD_COMPLEX)
+  append_subdir_files(SOURCES_COMPLEX "src")
+  list(APPEND SOURCES ${SOURCES_COMPLEX})
+endif()
+if (LAPACKE_BUILD_COMPLEX16)
+  append_subdir_files(SOURCES_COMPLEX16 "src")
+  list(APPEND SOURCES ${SOURCES_COMPLEX16})
+endif()
 append_subdir_files(DEPRECATED "src")
 append_subdir_files(EXTENDED "src")
 append_subdir_files(MATGEN "src")
@ -61,9 +80,13 @@ set_target_properties(
  SOVERSION ${LAPACK_MAJOR_VERSION}
  )
 target_include_directories(lapacke PUBLIC
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
  $<INSTALL_INTERFACE:include>
 )
+if(WIN32 AND NOT UNIX)
+  target_compile_definitions(lapacke PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE)
+  message(STATUS "Windows BUILD")
+endif()

 if(LAPACKE_WITH_TMG)
  target_link_libraries(lapacke PRIVATE tmglib)
@ -71,7 +94,11 @@ endif()
 target_link_libraries(lapacke PRIVATE ${LAPACK_LIBRARIES})

 lapack_install_library(lapacke)
-install(FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(
+  FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  COMPONENT Development
+  )

 if(BUILD_TESTING)
  add_subdirectory(example)
@ -82,6 +109,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_
 install(FILES
  ${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc
  DESTINATION ${PKG_CONFIG_DIR}
+  COMPONENT Development
  )

 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-version.cmake.in
@ -95,7 +123,10 @@ install(FILES
  ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake
  ${LAPACK_BINARY_DIR}/lapacke-config-version.cmake
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION}
+  COMPONENT Development
  )

 install(EXPORT lapacke-targets
-  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION})
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION}
+  COMPONENT Development
+  )
--- a/lapack-netlib/LAPACKE/Makefile
+++ b/lapack-netlib/LAPACKE/Makefile
@ -40,22 +40,26 @@
 # To clean everything including lapacke library type
 # 'make cleanall'
 #
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc

+.PHONY: all
 all: lapacke

+.PHONY: lapacke
 lapacke: include/lapacke_mangling.h
 	$(MAKE) -C src
 	$(MAKE) -C utils

 include/lapacke_mangling.h: include/lapacke_mangling_with_flags.h.in
-	cp $< $@
+	cp include/lapacke_mangling_with_flags.h.in $@

+.PHONY: lapacke_example
 lapacke_example: lapacke
 	$(MAKE) -C example

-#clean: cleanlib
-clean: cleanobj
+.PHONY: clean cleanobj cleanlib cleanexe
+clean:
 	$(MAKE) -C src clean
 	$(MAKE) -C utils clean
 	$(MAKE) -C example clean
@ -64,6 +68,6 @@ cleanobj:
 	$(MAKE) -C utils cleanobj
 	$(MAKE) -C example cleanobj
 cleanlib:
-	rm -f ../$(LAPACKELIB)
+	$(MAKE) -C src cleanlib
 cleanexe:
 	$(MAKE) -C example cleanexe
--- a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in
+++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in
@ -7,8 +7,11 @@ if(NOT TARGET lapacke)
  include("@LAPACK_BINARY_DIR@/lapack-targets.cmake")
 endif()

+# Hint for projects building against LAPACK
+set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID})
+
 # Report lapacke header search locations from build tree.
 set(LAPACKE_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include")

 # Report lapacke libraries.
-set(LAPACKE_LIBRARIES lapacke)
+set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES})
--- a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in
+++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in
@ -13,11 +13,14 @@ if(NOT TARGET lapacke)
  include(${_LAPACKE_SELF_DIR}/lapacke-targets.cmake)
 endif()

+# Hint for projects building against LAPACK
+set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID})
+
 # Report lapacke header search locations.
 set(LAPACKE_INCLUDE_DIRS ${_LAPACKE_PREFIX}/include)

 # Report lapacke libraries.
-set(LAPACKE_LIBRARIES lapacke)
+set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES})

 unset(_LAPACKE_PREFIX)
 unset(_LAPACKE_SELF_DIR)
--- a/lapack-netlib/LAPACKE/example/Makefile
+++ b/lapack-netlib/LAPACKE/example/Makefile
@ -1,34 +1,38 @@
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc

+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I. -I../include -c -o $@ $<
+
+.PHONY: all
 all: xexample_DGESV_rowmajor \
     xexample_DGESV_colmajor \
     xexample_DGELS_rowmajor \
     xexample_DGELS_colmajor

-LIBRARIES = ../../$(LAPACKELIB) ../../$(LAPACKLIB) $(BLASLIB)
+LIBRARIES = $(LAPACKELIB) $(LAPACKLIB) $(BLASLIB)

 # Double Precision Examples
 xexample_DGESV_rowmajor: example_DGESV_rowmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@

 xexample_DGESV_colmajor: example_DGESV_colmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@

 xexample_DGELS_rowmajor: example_DGELS_rowmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@

 xexample_DGELS_colmajor: example_DGELS_colmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@

+.PHONY: clean cleanobj cleanexe
 clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
 cleanexe:
 	rm -f x*
-
-.c.o:
-	$(CC) $(CFLAGS) -I. -I../include -c -o $@ $<
--- a/lapack-netlib/LAPACKE/include/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/include/CMakeLists.txt
@ -1,3 +1,3 @@
-set(LAPACKE_INCLUDE lapacke.h lapacke_config.h lapacke_utils.h)
+set(LAPACKE_INCLUDE lapacke.h lapack.h lapacke_config.h lapacke_utils.h)

 file(COPY ${LAPACKE_INCLUDE} DESTINATION ${LAPACK_BINARY_DIR}/include)
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
--- a/lapack-netlib/LAPACKE/src/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/src/CMakeLists.txt
@ -1,4 +1,4 @@
-set(SOURCES
+set(SOURCES_COMPLEX
 lapacke_cbbcsd.c
 lapacke_cbbcsd_work.c
 lapacke_cbdsqr.c
@ -78,11 +78,11 @@ lapacke_cgeqrf_work.c
 lapacke_cgeqrfp.c
 lapacke_cgeqrfp_work.c
 lapacke_cgeqrt.c
+lapacke_cgeqrt_work.c
 lapacke_cgeqrt2.c
 lapacke_cgeqrt2_work.c
 lapacke_cgeqrt3.c
 lapacke_cgeqrt3_work.c
-lapacke_cgeqrt_work.c
 lapacke_cgerfs.c
 lapacke_cgerfs_work.c
 lapacke_cgerqf.c
@ -93,6 +93,8 @@ lapacke_cgesv.c
 lapacke_cgesv_work.c
 lapacke_cgesvd.c
 lapacke_cgesvd_work.c
+lapacke_cgesvdq.c
+lapacke_cgesvdq_work.c
 lapacke_cgesvdx.c
 lapacke_cgesvdx_work.c
 lapacke_cgesvj.c
@ -129,10 +131,10 @@ lapacke_cggevx.c
 lapacke_cggevx_work.c
 lapacke_cggglm.c
 lapacke_cggglm_work.c
-lapacke_cgghrd.c
-lapacke_cgghrd_work.c
 lapacke_cgghd3.c
 lapacke_cgghd3_work.c
+lapacke_cgghrd.c
+lapacke_cgghrd_work.c
 lapacke_cgglse.c
 lapacke_cgglse_work.c
 lapacke_cggqrf.c
@ -157,14 +159,14 @@ lapacke_cgttrs.c
 lapacke_cgttrs_work.c
 lapacke_chbev.c
 lapacke_chbev_work.c
-lapacke_chbevd.c
-lapacke_chbevd_work.c
-lapacke_chbevx.c
-lapacke_chbevx_work.c
 lapacke_chbev_2stage.c
 lapacke_chbev_2stage_work.c
+lapacke_chbevd.c
+lapacke_chbevd_work.c
 lapacke_chbevd_2stage.c
 lapacke_chbevd_2stage_work.c
+lapacke_chbevx.c
+lapacke_chbevx_work.c
 lapacke_chbevx_2stage.c
 lapacke_chbevx_2stage_work.c
 lapacke_chbgst.c
@ -185,18 +187,18 @@ lapacke_cheequb.c
 lapacke_cheequb_work.c
 lapacke_cheev.c
 lapacke_cheev_work.c
-lapacke_cheevd.c
-lapacke_cheevd_work.c
-lapacke_cheevr.c
-lapacke_cheevr_work.c
-lapacke_cheevx.c
-lapacke_cheevx_work.c
 lapacke_cheev_2stage.c
 lapacke_cheev_2stage_work.c
+lapacke_cheevd.c
+lapacke_cheevd_work.c
 lapacke_cheevd_2stage.c
 lapacke_cheevd_2stage_work.c
+lapacke_cheevr.c
+lapacke_cheevr_work.c
 lapacke_cheevr_2stage.c
 lapacke_cheevr_2stage_work.c
+lapacke_cheevx.c
+lapacke_cheevx_work.c
 lapacke_cheevx_2stage.c
 lapacke_cheevx_2stage_work.c
 lapacke_chegst.c
@ -214,8 +216,8 @@ lapacke_cherfs_work.c
 lapacke_chesv.c
 lapacke_chesv_work.c
 lapacke_chesv_aa.c
-lapacke_chesv_aa_2stage.c
 lapacke_chesv_aa_work.c
+lapacke_chesv_aa_2stage.c
 lapacke_chesv_aa_2stage_work.c
 lapacke_chesv_rk.c
 lapacke_chesv_rk_work.c
@ -226,35 +228,35 @@ lapacke_cheswapr_work.c
 lapacke_chetrd.c
 lapacke_chetrd_work.c
 lapacke_chetrf.c
-lapacke_chetrf_rook.c
 lapacke_chetrf_work.c
-lapacke_chetrf_rook_work.c
 lapacke_chetrf_aa.c
-lapacke_chetrf_aa_2stage.c
 lapacke_chetrf_aa_work.c
+lapacke_chetrf_aa_2stage.c
 lapacke_chetrf_aa_2stage_work.c
 lapacke_chetrf_rk.c
 lapacke_chetrf_rk_work.c
+lapacke_chetrf_rook.c
+lapacke_chetrf_rook_work.c
 lapacke_chetri.c
+lapacke_chetri_work.c
 lapacke_chetri2.c
 lapacke_chetri2_work.c
-lapacke_chetri_3.c
-lapacke_chetri_3_work.c
 lapacke_chetri2x.c
 lapacke_chetri2x_work.c
-lapacke_chetri_work.c
+lapacke_chetri_3.c
+lapacke_chetri_3_work.c
 lapacke_chetrs.c
-lapacke_chetrs_rook.c
+lapacke_chetrs_work.c
 lapacke_chetrs2.c
 lapacke_chetrs2_work.c
-lapacke_chetrs_work.c
-lapacke_chetrs_rook_work.c
-lapacke_chetrs_aa.c
-lapacke_chetrs_aa_2stage.c
-lapacke_chetrs_aa_work.c
-lapacke_chetrs_aa_2stage_work.c
 lapacke_chetrs_3.c
 lapacke_chetrs_3_work.c
+lapacke_chetrs_aa.c
+lapacke_chetrs_aa_work.c
+lapacke_chetrs_aa_2stage.c
+lapacke_chetrs_aa_2stage_work.c
+lapacke_chetrs_rook.c
+lapacke_chetrs_rook_work.c
 lapacke_chfrk.c
 lapacke_chfrk_work.c
 lapacke_chgeqz.c
@ -445,52 +447,54 @@ lapacke_csyconv.c
 lapacke_csyconv_work.c
 lapacke_csyequb.c
 lapacke_csyequb_work.c
+lapacke_csyr.c
+lapacke_csyr_work.c
 lapacke_csyrfs.c
 lapacke_csyrfs_work.c
 lapacke_csysv.c
-lapacke_csysv_rook.c
-lapacke_csysv_rook_work.c
 lapacke_csysv_work.c
 lapacke_csysv_aa.c
-lapacke_csysv_aa_2stage.c
 lapacke_csysv_aa_work.c
+lapacke_csysv_aa_2stage.c
 lapacke_csysv_aa_2stage_work.c
 lapacke_csysv_rk.c
 lapacke_csysv_rk_work.c
+lapacke_csysv_rook.c
+lapacke_csysv_rook_work.c
 lapacke_csysvx.c
 lapacke_csysvx_work.c
 lapacke_csyswapr.c
 lapacke_csyswapr_work.c
 lapacke_csytrf.c
 lapacke_csytrf_work.c
-lapacke_csytrf_rook.c
-lapacke_csytrf_rook_work.c
 lapacke_csytrf_aa.c
-lapacke_csytrf_aa_2stage.c
 lapacke_csytrf_aa_work.c
+lapacke_csytrf_aa_2stage.c
 lapacke_csytrf_aa_2stage_work.c
 lapacke_csytrf_rk.c
 lapacke_csytrf_rk_work.c
+lapacke_csytrf_rook.c
+lapacke_csytrf_rook_work.c
 lapacke_csytri.c
+lapacke_csytri_work.c
 lapacke_csytri2.c
 lapacke_csytri2_work.c
-lapacke_csytri_3.c
-lapacke_csytri_3_work.c
 lapacke_csytri2x.c
 lapacke_csytri2x_work.c
-lapacke_csytri_work.c
+lapacke_csytri_3.c
+lapacke_csytri_3_work.c
 lapacke_csytrs.c
-lapacke_csytrs_rook.c
+lapacke_csytrs_work.c
 lapacke_csytrs2.c
 lapacke_csytrs2_work.c
-lapacke_csytrs_work.c
-lapacke_csytrs_rook_work.c
-lapacke_csytrs_aa.c
-lapacke_csytrs_aa_2stage.c
-lapacke_csytrs_aa_work.c
-lapacke_csytrs_aa_2stage_work.c
 lapacke_csytrs_3.c
 lapacke_csytrs_3_work.c
+lapacke_csytrs_aa.c
+lapacke_csytrs_aa_work.c
+lapacke_csytrs_aa_2stage.c
+lapacke_csytrs_aa_2stage_work.c
+lapacke_csytrs_rook.c
+lapacke_csytrs_rook_work.c
 lapacke_ctbcon.c
 lapacke_ctbcon_work.c
 lapacke_ctbrfs.c
@ -522,9 +526,9 @@ lapacke_ctpcon_work.c
 lapacke_ctpmqrt.c
 lapacke_ctpmqrt_work.c
 lapacke_ctpqrt.c
+lapacke_ctpqrt_work.c
 lapacke_ctpqrt2.c
 lapacke_ctpqrt2_work.c
-lapacke_ctpqrt_work.c
 lapacke_ctprfb.c
 lapacke_ctprfb_work.c
 lapacke_ctprfs.c
@ -601,14 +605,16 @@ lapacke_cupgtr.c
 lapacke_cupgtr_work.c
 lapacke_cupmtr.c
 lapacke_cupmtr_work.c
+)
+set(SOURCES_DOUBLE
 lapacke_dbbcsd.c
 lapacke_dbbcsd_work.c
 lapacke_dbdsdc.c
 lapacke_dbdsdc_work.c
-lapacke_dbdsvdx.c
-lapacke_dbdsvdx_work.c
 lapacke_dbdsqr.c
 lapacke_dbdsqr_work.c
+lapacke_dbdsvdx.c
+lapacke_dbdsvdx_work.c
 lapacke_ddisna.c
 lapacke_ddisna_work.c
 lapacke_dgbbrd.c
@ -686,11 +692,11 @@ lapacke_dgeqrf_work.c
 lapacke_dgeqrfp.c
 lapacke_dgeqrfp_work.c
 lapacke_dgeqrt.c
+lapacke_dgeqrt_work.c
 lapacke_dgeqrt2.c
 lapacke_dgeqrt2_work.c
 lapacke_dgeqrt3.c
 lapacke_dgeqrt3_work.c
-lapacke_dgeqrt_work.c
 lapacke_dgerfs.c
 lapacke_dgerfs_work.c
 lapacke_dgerqf.c
@ -701,6 +707,8 @@ lapacke_dgesv.c
 lapacke_dgesv_work.c
 lapacke_dgesvd.c
 lapacke_dgesvd_work.c
+lapacke_dgesvdq.c
+lapacke_dgesvdq_work.c
 lapacke_dgesvdx.c
 lapacke_dgesvdx_work.c
 lapacke_dgesvj.c
@ -737,10 +745,10 @@ lapacke_dggevx.c
 lapacke_dggevx_work.c
 lapacke_dggglm.c
 lapacke_dggglm_work.c
-lapacke_dgghrd.c
-lapacke_dgghrd_work.c
 lapacke_dgghd3.c
 lapacke_dgghd3_work.c
+lapacke_dgghrd.c
+lapacke_dgghrd_work.c
 lapacke_dgglse.c
 lapacke_dgglse_work.c
 lapacke_dggqrf.c
@ -823,10 +831,10 @@ lapacke_dopmtr.c
 lapacke_dopmtr_work.c
 lapacke_dorbdb.c
 lapacke_dorbdb_work.c
-lapacke_dorcsd2by1.c
-lapacke_dorcsd2by1_work.c
 lapacke_dorcsd.c
 lapacke_dorcsd_work.c
+lapacke_dorcsd2by1.c
+lapacke_dorcsd2by1_work.c
 lapacke_dorgbr.c
 lapacke_dorgbr_work.c
 lapacke_dorghr.c
@ -933,14 +941,14 @@ lapacke_dpttrs.c
 lapacke_dpttrs_work.c
 lapacke_dsbev.c
 lapacke_dsbev_work.c
-lapacke_dsbevd.c
-lapacke_dsbevd_work.c
-lapacke_dsbevx.c
-lapacke_dsbevx_work.c
 lapacke_dsbev_2stage.c
 lapacke_dsbev_2stage_work.c
+lapacke_dsbevd.c
+lapacke_dsbevd_work.c
 lapacke_dsbevd_2stage.c
 lapacke_dsbevd_2stage_work.c
+lapacke_dsbevx.c
+lapacke_dsbevx_work.c
 lapacke_dsbevx_2stage.c
 lapacke_dsbevx_2stage_work.c
 lapacke_dsbgst.c
@ -1021,18 +1029,18 @@ lapacke_dsyequb.c
 lapacke_dsyequb_work.c
 lapacke_dsyev.c
 lapacke_dsyev_work.c
-lapacke_dsyevd.c
-lapacke_dsyevd_work.c
-lapacke_dsyevr.c
-lapacke_dsyevr_work.c
-lapacke_dsyevx.c
-lapacke_dsyevx_work.c
 lapacke_dsyev_2stage.c
 lapacke_dsyev_2stage_work.c
+lapacke_dsyevd.c
+lapacke_dsyevd_work.c
 lapacke_dsyevd_2stage.c
 lapacke_dsyevd_2stage_work.c
+lapacke_dsyevr.c
+lapacke_dsyevr_work.c
 lapacke_dsyevr_2stage.c
 lapacke_dsyevr_2stage_work.c
+lapacke_dsyevx.c
+lapacke_dsyevx_work.c
 lapacke_dsyevx_2stage.c
 lapacke_dsyevx_2stage_work.c
 lapacke_dsygst.c
@ -1048,15 +1056,15 @@ lapacke_dsygvx_work.c
 lapacke_dsyrfs.c
 lapacke_dsyrfs_work.c
 lapacke_dsysv.c
-lapacke_dsysv_rook.c
-lapacke_dsysv_rook_work.c
 lapacke_dsysv_work.c
 lapacke_dsysv_aa.c
-lapacke_dsysv_aa_2stage.c
 lapacke_dsysv_aa_work.c
+lapacke_dsysv_aa_2stage.c
 lapacke_dsysv_aa_2stage_work.c
 lapacke_dsysv_rk.c
 lapacke_dsysv_rk_work.c
+lapacke_dsysv_rook.c
+lapacke_dsysv_rook_work.c
 lapacke_dsysvx.c
 lapacke_dsysvx_work.c
 lapacke_dsyswapr.c
@ -1065,33 +1073,33 @@ lapacke_dsytrd.c
 lapacke_dsytrd_work.c
 lapacke_dsytrf.c
 lapacke_dsytrf_work.c
-lapacke_dsytrf_rook.c
-lapacke_dsytrf_rook_work.c
 lapacke_dsytrf_aa.c
-lapacke_dsytrf_aa_2stage.c
 lapacke_dsytrf_aa_work.c
+lapacke_dsytrf_aa_2stage.c
 lapacke_dsytrf_aa_2stage_work.c
 lapacke_dsytrf_rk.c
 lapacke_dsytrf_rk_work.c
+lapacke_dsytrf_rook.c
+lapacke_dsytrf_rook_work.c
 lapacke_dsytri.c
+lapacke_dsytri_work.c
 lapacke_dsytri2.c
 lapacke_dsytri2_work.c
-lapacke_dsytri_3.c
-lapacke_dsytri_3_work.c
 lapacke_dsytri2x.c
 lapacke_dsytri2x_work.c
-lapacke_dsytri_work.c
+lapacke_dsytri_3.c
+lapacke_dsytri_3_work.c
 lapacke_dsytrs.c
-lapacke_dsytrs_rook.c
+lapacke_dsytrs_work.c
 lapacke_dsytrs2.c
 lapacke_dsytrs2_work.c
-lapacke_dsytrs_aa.c
-lapacke_dsytrs_aa_2stage.c
-lapacke_dsytrs_aa_work.c
-lapacke_dsytrs_aa_2stage_work.c
 lapacke_dsytrs_3.c
 lapacke_dsytrs_3_work.c
-lapacke_dsytrs_work.c
+lapacke_dsytrs_aa.c
+lapacke_dsytrs_aa_work.c
+lapacke_dsytrs_aa_2stage.c
+lapacke_dsytrs_aa_2stage_work.c
+lapacke_dsytrs_rook.c
 lapacke_dsytrs_rook_work.c
 lapacke_dtbcon.c
 lapacke_dtbcon_work.c
@ -1124,9 +1132,9 @@ lapacke_dtpcon_work.c
 lapacke_dtpmqrt.c
 lapacke_dtpmqrt_work.c
 lapacke_dtpqrt.c
+lapacke_dtpqrt_work.c
 lapacke_dtpqrt2.c
 lapacke_dtpqrt2_work.c
-lapacke_dtpqrt_work.c
 lapacke_dtprfb.c
 lapacke_dtprfb_work.c
 lapacke_dtprfs.c
@ -1163,15 +1171,21 @@ lapacke_dtrttp.c
 lapacke_dtrttp_work.c
 lapacke_dtzrzf.c
 lapacke_dtzrzf_work.c
+)
+
+set(SOURCES
 lapacke_nancheck.c
+lapacke_ilaver.c
+)
+set(SOURCES_SINGLE
 lapacke_sbbcsd.c
 lapacke_sbbcsd_work.c
 lapacke_sbdsdc.c
 lapacke_sbdsdc_work.c
-lapacke_sbdsvdx.c
-lapacke_sbdsvdx_work.c
 lapacke_sbdsqr.c
 lapacke_sbdsqr_work.c
+lapacke_sbdsvdx.c
+lapacke_sbdsvdx_work.c
 lapacke_sdisna.c
 lapacke_sdisna_work.c
 lapacke_sgbbrd.c
@ -1249,11 +1263,11 @@ lapacke_sgeqrf_work.c
 lapacke_sgeqrfp.c
 lapacke_sgeqrfp_work.c
 lapacke_sgeqrt.c
+lapacke_sgeqrt_work.c
 lapacke_sgeqrt2.c
 lapacke_sgeqrt2_work.c
 lapacke_sgeqrt3.c
 lapacke_sgeqrt3_work.c
-lapacke_sgeqrt_work.c
 lapacke_sgerfs.c
 lapacke_sgerfs_work.c
 lapacke_sgerqf.c
@ -1264,6 +1278,8 @@ lapacke_sgesv.c
 lapacke_sgesv_work.c
 lapacke_sgesvd.c
 lapacke_sgesvd_work.c
+lapacke_sgesvdq.c
+lapacke_sgesvdq_work.c
 lapacke_sgesvdx.c
 lapacke_sgesvdx_work.c
 lapacke_sgesvj.c
@ -1300,10 +1316,10 @@ lapacke_sggevx.c
 lapacke_sggevx_work.c
 lapacke_sggglm.c
 lapacke_sggglm_work.c
-lapacke_sgghrd.c
-lapacke_sgghrd_work.c
 lapacke_sgghd3.c
 lapacke_sgghd3_work.c
+lapacke_sgghrd.c
+lapacke_sgghrd_work.c
 lapacke_sgglse.c
 lapacke_sgglse_work.c
 lapacke_sggqrf.c
@ -1496,14 +1512,14 @@ lapacke_spttrs.c
 lapacke_spttrs_work.c
 lapacke_ssbev.c
 lapacke_ssbev_work.c
-lapacke_ssbevd.c
-lapacke_ssbevd_work.c
-lapacke_ssbevx.c
-lapacke_ssbevx_work.c
 lapacke_ssbev_2stage.c
 lapacke_ssbev_2stage_work.c
+lapacke_ssbevd.c
+lapacke_ssbevd_work.c
 lapacke_ssbevd_2stage.c
 lapacke_ssbevd_2stage_work.c
+lapacke_ssbevx.c
+lapacke_ssbevx_work.c
 lapacke_ssbevx_2stage.c
 lapacke_ssbevx_2stage_work.c
 lapacke_ssbgst.c
@ -1580,18 +1596,18 @@ lapacke_ssyequb.c
 lapacke_ssyequb_work.c
 lapacke_ssyev.c
 lapacke_ssyev_work.c
-lapacke_ssyevd.c
-lapacke_ssyevd_work.c
-lapacke_ssyevr.c
-lapacke_ssyevr_work.c
-lapacke_ssyevx.c
-lapacke_ssyevx_work.c
 lapacke_ssyev_2stage.c
 lapacke_ssyev_2stage_work.c
+lapacke_ssyevd.c
+lapacke_ssyevd_work.c
 lapacke_ssyevd_2stage.c
 lapacke_ssyevd_2stage_work.c
+lapacke_ssyevr.c
+lapacke_ssyevr_work.c
 lapacke_ssyevr_2stage.c
 lapacke_ssyevr_2stage_work.c
+lapacke_ssyevx.c
+lapacke_ssyevx_work.c
 lapacke_ssyevx_2stage.c
 lapacke_ssyevx_2stage_work.c
 lapacke_ssygst.c
@ -1607,8 +1623,6 @@ lapacke_ssygvx_work.c
 lapacke_ssyrfs.c
 lapacke_ssyrfs_work.c
 lapacke_ssysv.c
-lapacke_ssysv_rook.c
-lapacke_ssysv_rook_work.c
 lapacke_ssysv_work.c
 lapacke_ssysv_aa.c
 lapacke_ssysv_aa_work.c
@ -1616,6 +1630,8 @@ lapacke_ssysv_aa_2stage.c
 lapacke_ssysv_aa_2stage_work.c
 lapacke_ssysv_rk.c
 lapacke_ssysv_rk_work.c
+lapacke_ssysv_rook.c
+lapacke_ssysv_rook_work.c
 lapacke_ssysvx.c
 lapacke_ssysvx_work.c
 lapacke_ssyswapr.c
@ -1624,33 +1640,33 @@ lapacke_ssytrd.c
 lapacke_ssytrd_work.c
 lapacke_ssytrf.c
 lapacke_ssytrf_work.c
-lapacke_ssytrf_rook.c
-lapacke_ssytrf_rook_work.c
 lapacke_ssytrf_aa.c
-lapacke_ssytrf_aa_2stage.c
 lapacke_ssytrf_aa_work.c
+lapacke_ssytrf_aa_2stage.c
 lapacke_ssytrf_aa_2stage_work.c
 lapacke_ssytrf_rk.c
 lapacke_ssytrf_rk_work.c
+lapacke_ssytrf_rook.c
+lapacke_ssytrf_rook_work.c
 lapacke_ssytri.c
+lapacke_ssytri_work.c
 lapacke_ssytri2.c
 lapacke_ssytri2_work.c
-lapacke_ssytri_3.c
-lapacke_ssytri_3_work.c
 lapacke_ssytri2x.c
 lapacke_ssytri2x_work.c
-lapacke_ssytri_work.c
+lapacke_ssytri_3.c
+lapacke_ssytri_3_work.c
 lapacke_ssytrs.c
-lapacke_ssytrs_rook.c
+lapacke_ssytrs_work.c
 lapacke_ssytrs2.c
 lapacke_ssytrs2_work.c
-lapacke_ssytrs_aa.c
-lapacke_ssytrs_aa_2stage.c
-lapacke_ssytrs_aa_work.c
-lapacke_ssytrs_aa_2stage_work.c
 lapacke_ssytrs_3.c
 lapacke_ssytrs_3_work.c
-lapacke_ssytrs_work.c
+lapacke_ssytrs_aa.c
+lapacke_ssytrs_aa_work.c
+lapacke_ssytrs_aa_2stage.c
+lapacke_ssytrs_aa_2stage_work.c
+lapacke_ssytrs_rook.c
 lapacke_ssytrs_rook_work.c
 lapacke_stbcon.c
 lapacke_stbcon_work.c
@ -1722,6 +1738,8 @@ lapacke_strttp.c
 lapacke_strttp_work.c
 lapacke_stzrzf.c
 lapacke_stzrzf_work.c
+)
+set(SOURCES_COMPLEX16
 lapacke_zbbcsd.c
 lapacke_zbbcsd_work.c
 lapacke_zbdsqr.c
@ -1805,11 +1823,11 @@ lapacke_zgeqrf_work.c
 lapacke_zgeqrfp.c
 lapacke_zgeqrfp_work.c
 lapacke_zgeqrt.c
+lapacke_zgeqrt_work.c
 lapacke_zgeqrt2.c
 lapacke_zgeqrt2_work.c
 lapacke_zgeqrt3.c
 lapacke_zgeqrt3_work.c
-lapacke_zgeqrt_work.c
 lapacke_zgerfs.c
 lapacke_zgerfs_work.c
 lapacke_zgerqf.c
@ -1820,6 +1838,8 @@ lapacke_zgesv.c
 lapacke_zgesv_work.c
 lapacke_zgesvd.c
 lapacke_zgesvd_work.c
+lapacke_zgesvdq.c
+lapacke_zgesvdq_work.c
 lapacke_zgesvdx.c
 lapacke_zgesvdx_work.c
 lapacke_zgesvj.c
@ -1856,10 +1876,10 @@ lapacke_zggevx.c
 lapacke_zggevx_work.c
 lapacke_zggglm.c
 lapacke_zggglm_work.c
-lapacke_zgghrd.c
-lapacke_zgghrd_work.c
 lapacke_zgghd3.c
 lapacke_zgghd3_work.c
+lapacke_zgghrd.c
+lapacke_zgghrd_work.c
 lapacke_zgglse.c
 lapacke_zgglse_work.c
 lapacke_zggqrf.c
@ -1884,14 +1904,14 @@ lapacke_zgttrs.c
 lapacke_zgttrs_work.c
 lapacke_zhbev.c
 lapacke_zhbev_work.c
-lapacke_zhbevd.c
-lapacke_zhbevd_work.c
-lapacke_zhbevx.c
-lapacke_zhbevx_work.c
 lapacke_zhbev_2stage.c
 lapacke_zhbev_2stage_work.c
+lapacke_zhbevd.c
+lapacke_zhbevd_work.c
 lapacke_zhbevd_2stage.c
 lapacke_zhbevd_2stage_work.c
+lapacke_zhbevx.c
+lapacke_zhbevx_work.c
 lapacke_zhbevx_2stage.c
 lapacke_zhbevx_2stage_work.c
 lapacke_zhbgst.c
@ -1912,18 +1932,18 @@ lapacke_zheequb.c
 lapacke_zheequb_work.c
 lapacke_zheev.c
 lapacke_zheev_work.c
-lapacke_zheevd.c
-lapacke_zheevd_work.c
-lapacke_zheevr.c
-lapacke_zheevr_work.c
-lapacke_zheevx.c
-lapacke_zheevx_work.c
 lapacke_zheev_2stage.c
 lapacke_zheev_2stage_work.c
+lapacke_zheevd.c
+lapacke_zheevd_work.c
 lapacke_zheevd_2stage.c
 lapacke_zheevd_2stage_work.c
+lapacke_zheevr.c
+lapacke_zheevr_work.c
 lapacke_zheevr_2stage.c
 lapacke_zheevr_2stage_work.c
+lapacke_zheevx.c
+lapacke_zheevx_work.c
 lapacke_zheevx_2stage.c
 lapacke_zheevx_2stage_work.c
 lapacke_zhegst.c
@ -1941,8 +1961,8 @@ lapacke_zherfs_work.c
 lapacke_zhesv.c
 lapacke_zhesv_work.c
 lapacke_zhesv_aa.c
-lapacke_zhesv_aa_2stage.c
 lapacke_zhesv_aa_work.c
+lapacke_zhesv_aa_2stage.c
 lapacke_zhesv_aa_2stage_work.c
 lapacke_zhesv_rk.c
 lapacke_zhesv_rk_work.c
@ -1953,34 +1973,34 @@ lapacke_zheswapr_work.c
 lapacke_zhetrd.c
 lapacke_zhetrd_work.c
 lapacke_zhetrf.c
-lapacke_zhetrf_rook.c
 lapacke_zhetrf_work.c
-lapacke_zhetrf_rook_work.c
 lapacke_zhetrf_aa.c
-lapacke_zhetrf_aa_2stage.c
 lapacke_zhetrf_aa_work.c
+lapacke_zhetrf_aa_2stage.c
 lapacke_zhetrf_aa_2stage_work.c
 lapacke_zhetrf_rk.c
 lapacke_zhetrf_rk_work.c
+lapacke_zhetrf_rook.c
+lapacke_zhetrf_rook_work.c
 lapacke_zhetri.c
+lapacke_zhetri_work.c
 lapacke_zhetri2.c
 lapacke_zhetri2_work.c
-lapacke_zhetri_3.c
-lapacke_zhetri_3_work.c
 lapacke_zhetri2x.c
 lapacke_zhetri2x_work.c
-lapacke_zhetri_work.c
+lapacke_zhetri_3.c
+lapacke_zhetri_3_work.c
 lapacke_zhetrs.c
-lapacke_zhetrs_rook.c
+lapacke_zhetrs_work.c
 lapacke_zhetrs2.c
 lapacke_zhetrs2_work.c
-lapacke_zhetrs_work.c
-lapacke_zhetrs_aa.c
-lapacke_zhetrs_aa_2stage.c
-lapacke_zhetrs_aa_work.c
-lapacke_zhetrs_aa_2stage_work.c
 lapacke_zhetrs_3.c
 lapacke_zhetrs_3_work.c
+lapacke_zhetrs_aa.c
+lapacke_zhetrs_aa_work.c
+lapacke_zhetrs_aa_2stage.c
+lapacke_zhetrs_aa_2stage_work.c
+lapacke_zhetrs_rook.c
 lapacke_zhetrs_rook_work.c
 lapacke_zhfrk.c
 lapacke_zhfrk_work.c
@ -2172,52 +2192,54 @@ lapacke_zsyconv.c
 lapacke_zsyconv_work.c
 lapacke_zsyequb.c
 lapacke_zsyequb_work.c
+lapacke_zsyr.c
+lapacke_zsyr_work.c
 lapacke_zsyrfs.c
 lapacke_zsyrfs_work.c
 lapacke_zsysv.c
-lapacke_zsysv_rook.c
-lapacke_zsysv_rook_work.c
 lapacke_zsysv_work.c
 lapacke_zsysv_aa.c
-lapacke_zsysv_aa_2stage.c
 lapacke_zsysv_aa_work.c
+lapacke_zsysv_aa_2stage.c
 lapacke_zsysv_aa_2stage_work.c
 lapacke_zsysv_rk.c
 lapacke_zsysv_rk_work.c
+lapacke_zsysv_rook.c
+lapacke_zsysv_rook_work.c
 lapacke_zsysvx.c
 lapacke_zsysvx_work.c
 lapacke_zsyswapr.c
 lapacke_zsyswapr_work.c
 lapacke_zsytrf.c
 lapacke_zsytrf_work.c
-lapacke_zsytrf_rook.c
-lapacke_zsytrf_rook_work.c
 lapacke_zsytrf_aa.c
-lapacke_zsytrf_aa_2stage.c
 lapacke_zsytrf_aa_work.c
+lapacke_zsytrf_aa_2stage.c
 lapacke_zsytrf_aa_2stage_work.c
 lapacke_zsytrf_rk.c
 lapacke_zsytrf_rk_work.c
+lapacke_zsytrf_rook.c
+lapacke_zsytrf_rook_work.c
 lapacke_zsytri.c
+lapacke_zsytri_work.c
 lapacke_zsytri2.c
 lapacke_zsytri2_work.c
-lapacke_zsytri_3.c
-lapacke_zsytri_3_work.c
 lapacke_zsytri2x.c
 lapacke_zsytri2x_work.c
-lapacke_zsytri_work.c
+lapacke_zsytri_3.c
+lapacke_zsytri_3_work.c
 lapacke_zsytrs.c
-lapacke_zsytrs_rook.c
+lapacke_zsytrs_work.c
 lapacke_zsytrs2.c
 lapacke_zsytrs2_work.c
-lapacke_zsytrs_work.c
-lapacke_zsytrs_rook_work.c
-lapacke_zsytrs_aa.c
-lapacke_zsytrs_aa_2stage.c
-lapacke_zsytrs_aa_work.c
-lapacke_zsytrs_aa_2stage_work.c
 lapacke_zsytrs_3.c
 lapacke_zsytrs_3_work.c
+lapacke_zsytrs_aa.c
+lapacke_zsytrs_aa_work.c
+lapacke_zsytrs_aa_2stage.c
+lapacke_zsytrs_aa_2stage_work.c
+lapacke_zsytrs_rook.c
+lapacke_zsytrs_rook_work.c
 lapacke_ztbcon.c
 lapacke_ztbcon_work.c
 lapacke_ztbrfs.c
@ -2249,9 +2271,9 @@ lapacke_ztpcon_work.c
 lapacke_ztpmqrt.c
 lapacke_ztpmqrt_work.c
 lapacke_ztpqrt.c
+lapacke_ztpqrt_work.c
 lapacke_ztpqrt2.c
 lapacke_ztpqrt2_work.c
-lapacke_ztpqrt_work.c
 lapacke_ztprfb.c
 lapacke_ztprfb_work.c
 lapacke_ztprfs.c
@ -2328,11 +2350,6 @@ lapacke_zupgtr.c
 lapacke_zupgtr_work.c
 lapacke_zupmtr.c
 lapacke_zupmtr_work.c
-lapacke_zsyr.c
-lapacke_csyr.c
-lapacke_zsyr_work.c
-lapacke_csyr_work.c
-lapacke_ilaver.c
 )

 set(DEPRECATED
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@ -32,12 +32,21 @@
 ##############################################################################
 # makefile for LAPACKE, used to build lapacke binary.
 #
-# Note: we use multiple OBJ_A, OBJ_B, etc, instead of a single OBJ
+# Note: we use multiple OBJ_S, OBJ_C, etc, instead of a single OBJ
 # to allow build with mingw (argument list too long for the msys ar)
 #
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc

-OBJ_A = \
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<
+
+OBJ = \
+lapacke_ilaver.o \
+lapacke_nancheck.o
+
+OBJ_C = \
 lapacke_cbbcsd.o \
 lapacke_cbbcsd_work.o \
 lapacke_cbdsqr.o \
@ -82,12 +91,12 @@ lapacke_cgeevx.o \
 lapacke_cgeevx_work.o \
 lapacke_cgehrd.o \
 lapacke_cgehrd_work.o \
+lapacke_cgejsv.o \
+lapacke_cgejsv_work.o \
 lapacke_cgelq.o \
 lapacke_cgelq_work.o \
 lapacke_cgelq2.o \
 lapacke_cgelq2_work.o \
-lapacke_cgejsv.o \
-lapacke_cgejsv_work.o \
 lapacke_cgelqf.o \
 lapacke_cgelqf_work.o \
 lapacke_cgels.o \
@ -117,11 +126,11 @@ lapacke_cgeqrf_work.o \
 lapacke_cgeqrfp.o \
 lapacke_cgeqrfp_work.o \
 lapacke_cgeqrt.o \
+lapacke_cgeqrt_work.o \
 lapacke_cgeqrt2.o \
 lapacke_cgeqrt2_work.o \
 lapacke_cgeqrt3.o \
 lapacke_cgeqrt3_work.o \
-lapacke_cgeqrt_work.o \
 lapacke_cgerfs.o \
 lapacke_cgerfs_work.o \
 lapacke_cgerqf.o \
@ -132,6 +141,8 @@ lapacke_cgesv.o \
 lapacke_cgesv_work.o \
 lapacke_cgesvd.o \
 lapacke_cgesvd_work.o \
+lapacke_cgesvdq.o \
+lapacke_cgesvdq_work.o \
 lapacke_cgesvdx.o \
 lapacke_cgesvdx_work.o \
 lapacke_cgesvj.o \
@ -168,10 +179,10 @@ lapacke_cggevx.o \
 lapacke_cggevx_work.o \
 lapacke_cggglm.o \
 lapacke_cggglm_work.o \
-lapacke_cgghrd.o \
-lapacke_cgghrd_work.o \
 lapacke_cgghd3.o \
 lapacke_cgghd3_work.o \
+lapacke_cgghrd.o \
+lapacke_cgghrd_work.o \
 lapacke_cgglse.o \
 lapacke_cgglse_work.o \
 lapacke_cggqrf.o \
@ -196,14 +207,14 @@ lapacke_cgttrs.o \
 lapacke_cgttrs_work.o \
 lapacke_chbev.o \
 lapacke_chbev_work.o \
-lapacke_chbevd.o \
-lapacke_chbevd_work.o \
-lapacke_chbevx.o \
-lapacke_chbevx_work.o \
 lapacke_chbev_2stage.o \
 lapacke_chbev_2stage_work.o \
+lapacke_chbevd.o \
+lapacke_chbevd_work.o \
 lapacke_chbevd_2stage.o \
 lapacke_chbevd_2stage_work.o \
+lapacke_chbevx.o \
+lapacke_chbevx_work.o \
 lapacke_chbevx_2stage.o \
 lapacke_chbevx_2stage_work.o \
 lapacke_chbgst.o \
@ -224,18 +235,18 @@ lapacke_cheequb.o \
 lapacke_cheequb_work.o \
 lapacke_cheev.o \
 lapacke_cheev_work.o \
-lapacke_cheevd.o \
-lapacke_cheevd_work.o \
-lapacke_cheevr.o \
-lapacke_cheevr_work.o \
-lapacke_cheevx.o \
-lapacke_cheevx_work.o \
 lapacke_cheev_2stage.o \
 lapacke_cheev_2stage_work.o \
+lapacke_cheevd.o \
+lapacke_cheevd_work.o \
 lapacke_cheevd_2stage.o \
 lapacke_cheevd_2stage_work.o \
+lapacke_cheevr.o \
+lapacke_cheevr_work.o \
 lapacke_cheevr_2stage.o \
 lapacke_cheevr_2stage_work.o \
+lapacke_cheevx.o \
+lapacke_cheevx_work.o \
 lapacke_cheevx_2stage.o \
 lapacke_cheevx_2stage_work.o \
 lapacke_chegst.o \
@ -265,35 +276,35 @@ lapacke_cheswapr_work.o \
 lapacke_chetrd.o \
 lapacke_chetrd_work.o \
 lapacke_chetrf.o \
-lapacke_chetrf_rook.o \
 lapacke_chetrf_work.o \
-lapacke_chetrf_rook_work.o \
 lapacke_chetrf_aa.o \
-lapacke_chetrf_aa_2stage.o \
 lapacke_chetrf_aa_work.o \
+lapacke_chetrf_aa_2stage.o \
 lapacke_chetrf_aa_2stage_work.o \
 lapacke_chetrf_rk.o \
 lapacke_chetrf_rk_work.o \
+lapacke_chetrf_rook.o \
+lapacke_chetrf_rook_work.o \
 lapacke_chetri.o \
+lapacke_chetri_work.o \
 lapacke_chetri2.o \
 lapacke_chetri2_work.o \
-lapacke_chetri_3.o \
-lapacke_chetri_3_work.o \
 lapacke_chetri2x.o \
 lapacke_chetri2x_work.o \
-lapacke_chetri_work.o \
+lapacke_chetri_3.o \
+lapacke_chetri_3_work.o \
 lapacke_chetrs.o \
-lapacke_chetrs_rook.o \
+lapacke_chetrs_work.o \
 lapacke_chetrs2.o \
 lapacke_chetrs2_work.o \
-lapacke_chetrs_work.o \
-lapacke_chetrs_rook_work.o \
-lapacke_chetrs_aa.o \
-lapacke_chetrs_aa_2stage.o \
-lapacke_chetrs_aa_work.o \
-lapacke_chetrs_aa_2stage_work.o \
 lapacke_chetrs_3.o \
 lapacke_chetrs_3_work.o \
+lapacke_chetrs_aa.o \
+lapacke_chetrs_aa_work.o \
+lapacke_chetrs_aa_2stage.o \
+lapacke_chetrs_aa_2stage_work.o \
+lapacke_chetrs_rook.o \
+lapacke_chetrs_rook_work.o \
 lapacke_chfrk.o \
 lapacke_chfrk_work.o \
 lapacke_chgeqz.o \
@ -484,11 +495,11 @@ lapacke_csyconv.o \
 lapacke_csyconv_work.o \
 lapacke_csyequb.o \
 lapacke_csyequb_work.o \
+lapacke_csyr.o \
+lapacke_csyr_work.o \
 lapacke_csyrfs.o \
 lapacke_csyrfs_work.o \
 lapacke_csysv.o \
-lapacke_csysv_rook.o \
-lapacke_csysv_rook_work.o \
 lapacke_csysv_work.o \
 lapacke_csysv_aa.o \
 lapacke_csysv_aa_work.o \
@ -496,40 +507,42 @@ lapacke_csysv_aa_2stage.o \
 lapacke_csysv_aa_2stage_work.o \
 lapacke_csysv_rk.o \
 lapacke_csysv_rk_work.o \
+lapacke_csysv_rook.o \
+lapacke_csysv_rook_work.o \
 lapacke_csysvx.o \
 lapacke_csysvx_work.o \
 lapacke_csyswapr.o \
 lapacke_csyswapr_work.o \
 lapacke_csytrf.o \
 lapacke_csytrf_work.o \
-lapacke_csytrf_rook.o \
-lapacke_csytrf_rook_work.o \
 lapacke_csytrf_aa.o \
-lapacke_csytrf_aa_2stage.o \
 lapacke_csytrf_aa_work.o \
+lapacke_csytrf_aa_2stage.o \
 lapacke_csytrf_aa_2stage_work.o \
 lapacke_csytrf_rk.o \
 lapacke_csytrf_rk_work.o \
+lapacke_csytrf_rook.o \
+lapacke_csytrf_rook_work.o \
 lapacke_csytri.o \
+lapacke_csytri_work.o \
 lapacke_csytri2.o \
 lapacke_csytri2_work.o \
-lapacke_csytri_3.o \
-lapacke_csytri_3_work.o \
 lapacke_csytri2x.o \
 lapacke_csytri2x_work.o \
-lapacke_csytri_work.o \
+lapacke_csytri_3.o \
+lapacke_csytri_3_work.o \
 lapacke_csytrs.o \
-lapacke_csytrs_rook.o \
+lapacke_csytrs_work.o \
 lapacke_csytrs2.o \
 lapacke_csytrs2_work.o \
-lapacke_csytrs_work.o \
-lapacke_csytrs_rook_work.o \
-lapacke_csytrs_aa.o \
-lapacke_csytrs_aa_2stage.o \
-lapacke_csytrs_aa_work.o \
-lapacke_csytrs_aa_2stage_work.o \
 lapacke_csytrs_3.o \
 lapacke_csytrs_3_work.o \
+lapacke_csytrs_aa.o \
+lapacke_csytrs_aa_work.o \
+lapacke_csytrs_aa_2stage.o \
+lapacke_csytrs_aa_2stage_work.o \
+lapacke_csytrs_rook.o \
+lapacke_csytrs_rook_work.o \
 lapacke_ctbcon.o \
 lapacke_ctbcon_work.o \
 lapacke_ctbrfs.o \
@ -561,9 +574,9 @@ lapacke_ctpcon_work.o \
 lapacke_ctpmqrt.o \
 lapacke_ctpmqrt_work.o \
 lapacke_ctpqrt.o \
+lapacke_ctpqrt_work.o \
 lapacke_ctpqrt2.o \
 lapacke_ctpqrt2_work.o \
-lapacke_ctpqrt_work.o \
 lapacke_ctprfb.o \
 lapacke_ctprfb_work.o \
 lapacke_ctprfs.o \
@ -639,15 +652,17 @@ lapacke_cunmtr_work.o \
 lapacke_cupgtr.o \
 lapacke_cupgtr_work.o \
 lapacke_cupmtr.o \
-lapacke_cupmtr_work.o \
+lapacke_cupmtr_work.o
+
+OBJ_D = \
 lapacke_dbbcsd.o \
 lapacke_dbbcsd_work.o \
 lapacke_dbdsdc.o \
 lapacke_dbdsdc_work.o \
-lapacke_dbdsvdx.o \
-lapacke_dbdsvdx_work.o \
 lapacke_dbdsqr.o \
 lapacke_dbdsqr_work.o \
+lapacke_dbdsvdx.o \
+lapacke_dbdsvdx_work.o \
 lapacke_ddisna.o \
 lapacke_ddisna_work.o \
 lapacke_dgbbrd.o \
@ -725,11 +740,11 @@ lapacke_dgeqrf_work.o \
 lapacke_dgeqrfp.o \
 lapacke_dgeqrfp_work.o \
 lapacke_dgeqrt.o \
+lapacke_dgeqrt_work.o \
 lapacke_dgeqrt2.o \
 lapacke_dgeqrt2_work.o \
 lapacke_dgeqrt3.o \
 lapacke_dgeqrt3_work.o \
-lapacke_dgeqrt_work.o \
 lapacke_dgerfs.o \
 lapacke_dgerfs_work.o \
 lapacke_dgerqf.o \
@ -740,6 +755,8 @@ lapacke_dgesv.o \
 lapacke_dgesv_work.o \
 lapacke_dgesvd.o \
 lapacke_dgesvd_work.o \
+lapacke_dgesvdq.o \
+lapacke_dgesvdq_work.o \
 lapacke_dgesvdx.o \
 lapacke_dgesvdx_work.o \
 lapacke_dgesvj.o \
@ -776,10 +793,10 @@ lapacke_dggevx.o \
 lapacke_dggevx_work.o \
 lapacke_dggglm.o \
 lapacke_dggglm_work.o \
-lapacke_dgghrd.o \
-lapacke_dgghrd_work.o \
 lapacke_dgghd3.o \
 lapacke_dgghd3_work.o \
+lapacke_dgghrd.o \
+lapacke_dgghrd_work.o \
 lapacke_dgglse.o \
 lapacke_dgglse_work.o \
 lapacke_dggqrf.o \
@ -972,14 +989,14 @@ lapacke_dpttrs.o \
 lapacke_dpttrs_work.o \
 lapacke_dsbev.o \
 lapacke_dsbev_work.o \
-lapacke_dsbevd.o \
-lapacke_dsbevd_work.o \
-lapacke_dsbevx.o \
-lapacke_dsbevx_work.o \
 lapacke_dsbev_2stage.o \
 lapacke_dsbev_2stage_work.o \
+lapacke_dsbevd.o \
+lapacke_dsbevd_work.o \
 lapacke_dsbevd_2stage.o \
 lapacke_dsbevd_2stage_work.o \
+lapacke_dsbevx.o \
+lapacke_dsbevx_work.o \
 lapacke_dsbevx_2stage.o \
 lapacke_dsbevx_2stage_work.o \
 lapacke_dsbgst.o \
@ -1060,18 +1077,18 @@ lapacke_dsyequb.o \
 lapacke_dsyequb_work.o \
 lapacke_dsyev.o \
 lapacke_dsyev_work.o \
-lapacke_dsyevd.o \
-lapacke_dsyevd_work.o \
-lapacke_dsyevr.o \
-lapacke_dsyevr_work.o \
-lapacke_dsyevx.o \
-lapacke_dsyevx_work.o \
 lapacke_dsyev_2stage.o \
 lapacke_dsyev_2stage_work.o \
+lapacke_dsyevd.o \
+lapacke_dsyevd_work.o \
 lapacke_dsyevd_2stage.o \
 lapacke_dsyevd_2stage_work.o \
+lapacke_dsyevr.o \
+lapacke_dsyevr_work.o \
 lapacke_dsyevr_2stage.o \
 lapacke_dsyevr_2stage_work.o \
+lapacke_dsyevx.o \
+lapacke_dsyevx_work.o \
 lapacke_dsyevx_2stage.o \
 lapacke_dsyevx_2stage_work.o \
 lapacke_dsygst.o \
@ -1087,8 +1104,6 @@ lapacke_dsygvx_work.o \
 lapacke_dsyrfs.o \
 lapacke_dsyrfs_work.o \
 lapacke_dsysv.o \
-lapacke_dsysv_rook.o \
-lapacke_dsysv_rook_work.o \
 lapacke_dsysv_work.o \
 lapacke_dsysv_aa.o \
 lapacke_dsysv_aa_work.o \
@ -1096,6 +1111,8 @@ lapacke_dsysv_aa_2stage.o \
 lapacke_dsysv_aa_2stage_work.o \
 lapacke_dsysv_rk.o \
 lapacke_dsysv_rk_work.o \
+lapacke_dsysv_rook.o \
+lapacke_dsysv_rook_work.o \
 lapacke_dsysvx.o \
 lapacke_dsysvx_work.o \
 lapacke_dsyswapr.o \
@ -1104,36 +1121,34 @@ lapacke_dsytrd.o \
 lapacke_dsytrd_work.o \
 lapacke_dsytrf.o \
 lapacke_dsytrf_work.o \
-lapacke_dsytrf_rook.o \
-lapacke_dsytrf_rook_work.o \
 lapacke_dsytrf_aa.o \
 lapacke_dsytrf_aa_work.o \
 lapacke_dsytrf_aa_2stage.o \
 lapacke_dsytrf_aa_2stage_work.o \
 lapacke_dsytrf_rk.o \
 lapacke_dsytrf_rk_work.o \
+lapacke_dsytrf_rook.o \
+lapacke_dsytrf_rook_work.o \
 lapacke_dsytri.o \
+lapacke_dsytri_work.o \
 lapacke_dsytri2.o \
 lapacke_dsytri2_work.o \
-lapacke_dsytri_3.o \
-lapacke_dsytri_3_work.o \
 lapacke_dsytri2x.o \
 lapacke_dsytri2x_work.o \
-lapacke_dsytri_work.o
-
-OBJ_B = \
+lapacke_dsytri_3.o \
+lapacke_dsytri_3_work.o \
 lapacke_dsytrs.o \
-lapacke_dsytrs_rook.o \
+lapacke_dsytrs_work.o \
 lapacke_dsytrs2.o \
 lapacke_dsytrs2_work.o \
-lapacke_dsytrs_work.o \
-lapacke_dsytrs_rook_work.o \
-lapacke_dsytrs_aa.o \
-lapacke_dsytrs_aa_2stage.o \
-lapacke_dsytrs_aa_work.o \
-lapacke_dsytrs_aa_2stage_work.o \
 lapacke_dsytrs_3.o \
 lapacke_dsytrs_3_work.o \
+lapacke_dsytrs_aa.o \
+lapacke_dsytrs_aa_work.o \
+lapacke_dsytrs_aa_2stage.o \
+lapacke_dsytrs_aa_2stage_work.o \
+lapacke_dsytrs_rook.o \
+lapacke_dsytrs_rook_work.o \
 lapacke_dtbcon.o \
 lapacke_dtbcon_work.o \
 lapacke_dtbrfs.o \
@ -1165,9 +1180,9 @@ lapacke_dtpcon_work.o \
 lapacke_dtpmqrt.o \
 lapacke_dtpmqrt_work.o \
 lapacke_dtpqrt.o \
+lapacke_dtpqrt_work.o \
 lapacke_dtpqrt2.o \
 lapacke_dtpqrt2_work.o \
-lapacke_dtpqrt_work.o \
 lapacke_dtprfb.o \
 lapacke_dtprfb_work.o \
 lapacke_dtprfs.o \
@ -1203,16 +1218,17 @@ lapacke_dtrttf_work.o \
 lapacke_dtrttp.o \
 lapacke_dtrttp_work.o \
 lapacke_dtzrzf.o \
-lapacke_dtzrzf_work.o \
-lapacke_nancheck.o  \
+lapacke_dtzrzf_work.o
+
+OBJ_S = \
 lapacke_sbbcsd.o \
 lapacke_sbbcsd_work.o \
 lapacke_sbdsdc.o \
 lapacke_sbdsdc_work.o \
-lapacke_sbdsvdx.o \
-lapacke_sbdsvdx_work.o \
 lapacke_sbdsqr.o \
 lapacke_sbdsqr_work.o \
+lapacke_sbdsvdx.o \
+lapacke_sbdsvdx_work.o \
 lapacke_sdisna.o \
 lapacke_sdisna_work.o \
 lapacke_sgbbrd.o \
@ -1290,11 +1306,11 @@ lapacke_sgeqrf_work.o \
 lapacke_sgeqrfp.o \
 lapacke_sgeqrfp_work.o \
 lapacke_sgeqrt.o \
+lapacke_sgeqrt_work.o \
 lapacke_sgeqrt2.o \
 lapacke_sgeqrt2_work.o \
 lapacke_sgeqrt3.o \
 lapacke_sgeqrt3_work.o \
-lapacke_sgeqrt_work.o \
 lapacke_sgerfs.o \
 lapacke_sgerfs_work.o \
 lapacke_sgerqf.o \
@ -1305,6 +1321,8 @@ lapacke_sgesv.o \
 lapacke_sgesv_work.o \
 lapacke_sgesvd.o \
 lapacke_sgesvd_work.o \
+lapacke_sgesvdq.o \
+lapacke_sgesvdq_work.o \
 lapacke_sgesvdx.o \
 lapacke_sgesvdx_work.o \
 lapacke_sgesvj.o \
@ -1341,10 +1359,10 @@ lapacke_sggevx.o \
 lapacke_sggevx_work.o \
 lapacke_sggglm.o \
 lapacke_sggglm_work.o \
-lapacke_sgghrd.o \
-lapacke_sgghrd_work.o \
 lapacke_sgghd3.o \
 lapacke_sgghd3_work.o \
+lapacke_sgghrd.o \
+lapacke_sgghrd_work.o \
 lapacke_sgglse.o \
 lapacke_sgglse_work.o \
 lapacke_sggqrf.o \
@ -1537,14 +1555,14 @@ lapacke_spttrs.o \
 lapacke_spttrs_work.o \
 lapacke_ssbev.o \
 lapacke_ssbev_work.o \
-lapacke_ssbevd.o \
-lapacke_ssbevd_work.o \
-lapacke_ssbevx.o \
-lapacke_ssbevx_work.o \
 lapacke_ssbev_2stage.o \
 lapacke_ssbev_2stage_work.o \
+lapacke_ssbevd.o \
+lapacke_ssbevd_work.o \
 lapacke_ssbevd_2stage.o \
 lapacke_ssbevd_2stage_work.o \
+lapacke_ssbevx.o \
+lapacke_ssbevx_work.o \
 lapacke_ssbevx_2stage.o \
 lapacke_ssbevx_2stage_work.o \
 lapacke_ssbgst.o \
@ -1621,18 +1639,18 @@ lapacke_ssyequb.o \
 lapacke_ssyequb_work.o \
 lapacke_ssyev.o \
 lapacke_ssyev_work.o \
-lapacke_ssyevd.o \
-lapacke_ssyevd_work.o \
-lapacke_ssyevr.o \
-lapacke_ssyevr_work.o \
-lapacke_ssyevx.o \
-lapacke_ssyevx_work.o \
 lapacke_ssyev_2stage.o \
 lapacke_ssyev_2stage_work.o \
+lapacke_ssyevd.o \
+lapacke_ssyevd_work.o \
 lapacke_ssyevd_2stage.o \
 lapacke_ssyevd_2stage_work.o \
+lapacke_ssyevr.o \
+lapacke_ssyevr_work.o \
 lapacke_ssyevr_2stage.o \
 lapacke_ssyevr_2stage_work.o \
+lapacke_ssyevx.o \
+lapacke_ssyevx_work.o \
 lapacke_ssyevx_2stage.o \
 lapacke_ssyevx_2stage_work.o \
 lapacke_ssygst.o \
@ -1648,8 +1666,6 @@ lapacke_ssygvx_work.o \
 lapacke_ssyrfs.o \
 lapacke_ssyrfs_work.o \
 lapacke_ssysv.o \
-lapacke_ssysv_rook.o \
-lapacke_ssysv_rook_work.o \
 lapacke_ssysv_work.o \
 lapacke_ssysv_aa.o \
 lapacke_ssysv_aa_work.o \
@ -1657,6 +1673,8 @@ lapacke_ssysv_aa_2stage.o \
 lapacke_ssysv_aa_2stage_work.o \
 lapacke_ssysv_rk.o \
 lapacke_ssysv_rk_work.o \
+lapacke_ssysv_rook.o \
+lapacke_ssysv_rook_work.o \
 lapacke_ssysvx.o \
 lapacke_ssysvx_work.o \
 lapacke_ssyswapr.o \
@ -1665,34 +1683,34 @@ lapacke_ssytrd.o \
 lapacke_ssytrd_work.o \
 lapacke_ssytrf.o \
 lapacke_ssytrf_work.o \
-lapacke_ssytrf_rook.o \
-lapacke_ssytrf_rook_work.o \
 lapacke_ssytrf_aa.o \
 lapacke_ssytrf_aa_work.o \
 lapacke_ssytrf_aa_2stage.o \
 lapacke_ssytrf_aa_2stage_work.o \
 lapacke_ssytrf_rk.o \
 lapacke_ssytrf_rk_work.o \
+lapacke_ssytrf_rook.o \
+lapacke_ssytrf_rook_work.o \
 lapacke_ssytri.o \
+lapacke_ssytri_work.o \
 lapacke_ssytri2.o \
 lapacke_ssytri2_work.o \
-lapacke_ssytri_3.o \
-lapacke_ssytri_3_work.o \
 lapacke_ssytri2x.o \
 lapacke_ssytri2x_work.o \
-lapacke_ssytri_work.o \
+lapacke_ssytri_3.o \
+lapacke_ssytri_3_work.o \
 lapacke_ssytrs.o \
-lapacke_ssytrs_rook.o \
+lapacke_ssytrs_work.o \
 lapacke_ssytrs2.o \
 lapacke_ssytrs2_work.o \
-lapacke_ssytrs_work.o \
-lapacke_ssytrs_rook_work.o \
-lapacke_ssytrs_aa.o \
-lapacke_ssytrs_aa_2stage.o \
-lapacke_ssytrs_aa_work.o \
-lapacke_ssytrs_aa_2stage_work.o \
 lapacke_ssytrs_3.o \
 lapacke_ssytrs_3_work.o \
+lapacke_ssytrs_aa.o \
+lapacke_ssytrs_aa_work.o \
+lapacke_ssytrs_aa_2stage.o \
+lapacke_ssytrs_aa_2stage_work.o \
+lapacke_ssytrs_rook.o \
+lapacke_ssytrs_rook_work.o \
 lapacke_stbcon.o \
 lapacke_stbcon_work.o \
 lapacke_stbrfs.o \
@ -1762,7 +1780,9 @@ lapacke_strttf_work.o \
 lapacke_strttp.o \
 lapacke_strttp_work.o \
 lapacke_stzrzf.o \
-lapacke_stzrzf_work.o \
+lapacke_stzrzf_work.o
+
+OBJ_Z = \
 lapacke_zbbcsd.o \
 lapacke_zbbcsd_work.o \
 lapacke_zbdsqr.o \
@ -1846,11 +1866,11 @@ lapacke_zgeqrf_work.o \
 lapacke_zgeqrfp.o \
 lapacke_zgeqrfp_work.o \
 lapacke_zgeqrt.o \
+lapacke_zgeqrt_work.o \
 lapacke_zgeqrt2.o \
 lapacke_zgeqrt2_work.o \
 lapacke_zgeqrt3.o \
 lapacke_zgeqrt3_work.o \
-lapacke_zgeqrt_work.o \
 lapacke_zgerfs.o \
 lapacke_zgerfs_work.o \
 lapacke_zgerqf.o \
@ -1861,6 +1881,8 @@ lapacke_zgesv.o \
 lapacke_zgesv_work.o \
 lapacke_zgesvd.o \
 lapacke_zgesvd_work.o \
+lapacke_zgesvdq.o \
+lapacke_zgesvdq_work.o \
 lapacke_zgesvdx.o \
 lapacke_zgesvdx_work.o \
 lapacke_zgesvj.o \
@ -1897,10 +1919,10 @@ lapacke_zggevx.o \
 lapacke_zggevx_work.o \
 lapacke_zggglm.o \
 lapacke_zggglm_work.o \
-lapacke_zgghrd.o \
-lapacke_zgghrd_work.o \
 lapacke_zgghd3.o \
 lapacke_zgghd3_work.o \
+lapacke_zgghrd.o \
+lapacke_zgghrd_work.o \
 lapacke_zgglse.o \
 lapacke_zgglse_work.o \
 lapacke_zggqrf.o \
@ -1925,14 +1947,14 @@ lapacke_zgttrs.o \
 lapacke_zgttrs_work.o \
 lapacke_zhbev.o \
 lapacke_zhbev_work.o \
-lapacke_zhbevd.o \
-lapacke_zhbevd_work.o \
-lapacke_zhbevx.o \
-lapacke_zhbevx_work.o \
 lapacke_zhbev_2stage.o \
 lapacke_zhbev_2stage_work.o \
+lapacke_zhbevd.o \
+lapacke_zhbevd_work.o \
 lapacke_zhbevd_2stage.o \
 lapacke_zhbevd_2stage_work.o \
+lapacke_zhbevx.o \
+lapacke_zhbevx_work.o \
 lapacke_zhbevx_2stage.o \
 lapacke_zhbevx_2stage_work.o \
 lapacke_zhbgst.o \
@ -1953,18 +1975,18 @@ lapacke_zheequb.o \
 lapacke_zheequb_work.o \
 lapacke_zheev.o \
 lapacke_zheev_work.o \
-lapacke_zheevd.o \
-lapacke_zheevd_work.o \
-lapacke_zheevr.o \
-lapacke_zheevr_work.o \
-lapacke_zheevx.o \
-lapacke_zheevx_work.o \
 lapacke_zheev_2stage.o \
 lapacke_zheev_2stage_work.o \
+lapacke_zheevd.o \
+lapacke_zheevd_work.o \
 lapacke_zheevd_2stage.o \
 lapacke_zheevd_2stage_work.o \
+lapacke_zheevr.o \
+lapacke_zheevr_work.o \
 lapacke_zheevr_2stage.o \
 lapacke_zheevr_2stage_work.o \
+lapacke_zheevx.o \
+lapacke_zheevx_work.o \
 lapacke_zheevx_2stage.o \
 lapacke_zheevx_2stage_work.o \
 lapacke_zhegst.o \
@ -1994,35 +2016,35 @@ lapacke_zheswapr_work.o \
 lapacke_zhetrd.o \
 lapacke_zhetrd_work.o \
 lapacke_zhetrf.o \
-lapacke_zhetrf_rook.o \
 lapacke_zhetrf_work.o \
-lapacke_zhetrf_rook_work.o \
 lapacke_zhetrf_aa.o \
-lapacke_zhetrf_aa_2stage.o \
 lapacke_zhetrf_aa_work.o \
+lapacke_zhetrf_aa_2stage.o \
 lapacke_zhetrf_aa_2stage_work.o \
 lapacke_zhetrf_rk.o \
 lapacke_zhetrf_rk_work.o \
+lapacke_zhetrf_rook.o \
+lapacke_zhetrf_rook_work.o \
 lapacke_zhetri.o \
+lapacke_zhetri_work.o \
 lapacke_zhetri2.o \
 lapacke_zhetri2_work.o \
-lapacke_zhetri_3.o \
-lapacke_zhetri_3_work.o \
 lapacke_zhetri2x.o \
 lapacke_zhetri2x_work.o \
-lapacke_zhetri_work.o \
+lapacke_zhetri_3.o \
+lapacke_zhetri_3_work.o \
 lapacke_zhetrs.o \
-lapacke_zhetrs_rook.o \
+lapacke_zhetrs_work.o \
 lapacke_zhetrs2.o \
 lapacke_zhetrs2_work.o \
-lapacke_zhetrs_work.o \
-lapacke_zhetrs_rook_work.o \
-lapacke_zhetrs_aa.o \
-lapacke_zhetrs_aa_2stage.o \
-lapacke_zhetrs_aa_work.o \
-lapacke_zhetrs_aa_2stage_work.o \
 lapacke_zhetrs_3.o \
 lapacke_zhetrs_3_work.o \
+lapacke_zhetrs_aa.o \
+lapacke_zhetrs_aa_work.o \
+lapacke_zhetrs_aa_2stage.o \
+lapacke_zhetrs_aa_2stage_work.o \
+lapacke_zhetrs_rook.o \
+lapacke_zhetrs_rook_work.o \
 lapacke_zhfrk.o \
 lapacke_zhfrk_work.o \
 lapacke_zhgeqz.o \
@ -2213,11 +2235,11 @@ lapacke_zsyconv.o \
 lapacke_zsyconv_work.o \
 lapacke_zsyequb.o \
 lapacke_zsyequb_work.o \
+lapacke_zsyr.o \
+lapacke_zsyr_work.o \
 lapacke_zsyrfs.o \
 lapacke_zsyrfs_work.o \
 lapacke_zsysv.o \
-lapacke_zsysv_rook.o \
-lapacke_zsysv_rook_work.o \
 lapacke_zsysv_work.o \
 lapacke_zsysv_aa.o \
 lapacke_zsysv_aa_work.o \
@ -2225,40 +2247,42 @@ lapacke_zsysv_aa_2stage.o \
 lapacke_zsysv_aa_2stage_work.o \
 lapacke_zsysv_rk.o \
 lapacke_zsysv_rk_work.o \
+lapacke_zsysv_rook.o \
+lapacke_zsysv_rook_work.o \
 lapacke_zsysvx.o \
 lapacke_zsysvx_work.o \
 lapacke_zsyswapr.o \
 lapacke_zsyswapr_work.o \
 lapacke_zsytrf.o \
 lapacke_zsytrf_work.o \
-lapacke_zsytrf_rook.o \
-lapacke_zsytrf_rook_work.o \
 lapacke_zsytrf_aa.o \
-lapacke_zsytrf_aa_2stage.o \
 lapacke_zsytrf_aa_work.o \
+lapacke_zsytrf_aa_2stage.o \
 lapacke_zsytrf_aa_2stage_work.o \
 lapacke_zsytrf_rk.o \
 lapacke_zsytrf_rk_work.o \
+lapacke_zsytrf_rook.o \
+lapacke_zsytrf_rook_work.o \
 lapacke_zsytri.o \
+lapacke_zsytri_work.o \
 lapacke_zsytri2.o \
 lapacke_zsytri2_work.o \
-lapacke_zsytri_3.o \
-lapacke_zsytri_3_work.o \
 lapacke_zsytri2x.o \
 lapacke_zsytri2x_work.o \
-lapacke_zsytri_work.o \
+lapacke_zsytri_3.o \
+lapacke_zsytri_3_work.o \
 lapacke_zsytrs.o \
-lapacke_zsytrs_rook.o \
+lapacke_zsytrs_work.o \
 lapacke_zsytrs2.o \
 lapacke_zsytrs2_work.o \
-lapacke_zsytrs_work.o \
-lapacke_zsytrs_rook_work.o \
-lapacke_zsytrs_aa.o \
-lapacke_zsytrs_aa_2stage.o \
-lapacke_zsytrs_aa_work.o \
-lapacke_zsytrs_aa_2stage_work.o \
 lapacke_zsytrs_3.o \
 lapacke_zsytrs_3_work.o \
+lapacke_zsytrs_aa.o \
+lapacke_zsytrs_aa_work.o \
+lapacke_zsytrs_aa_2stage.o \
+lapacke_zsytrs_aa_2stage_work.o \
+lapacke_zsytrs_rook.o \
+lapacke_zsytrs_rook_work.o \
 lapacke_ztbcon.o \
 lapacke_ztbcon_work.o \
 lapacke_ztbrfs.o \
@ -2290,9 +2314,9 @@ lapacke_ztpcon_work.o \
 lapacke_ztpmqrt.o \
 lapacke_ztpmqrt_work.o \
 lapacke_ztpqrt.o \
+lapacke_ztpqrt_work.o \
 lapacke_ztpqrt2.o \
 lapacke_ztpqrt2_work.o \
-lapacke_ztpqrt_work.o \
 lapacke_ztprfb.o \
 lapacke_ztprfb_work.o \
 lapacke_ztprfs.o \
@ -2368,12 +2392,7 @@ lapacke_zunmtr_work.o \
 lapacke_zupgtr.o \
 lapacke_zupgtr_work.o \
 lapacke_zupmtr.o \
-lapacke_zupmtr_work.o \
-lapacke_zsyr.o \
-lapacke_csyr.o \
-lapacke_zsyr_work.o \
-lapacke_csyr_work.o \
-lapacke_ilaver.o
+lapacke_zupmtr_work.o

 ifdef BUILD_DEPRECATED
 DEPRECATED = \
@ -2452,27 +2471,33 @ lapacke_zlagsy.o \
 lapacke_zlagsy_work.o
 endif

-all: ../../$(LAPACKELIB)
+.PHONY: all
+all: $(LAPACKELIB)

-.PHONY: ../../$(LAPACKELIB)
-
-../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN)
-	$(ARCH) $(ARCHFLAGS) $@ $(OBJ_A)
-	$(ARCH) $(ARCHFLAGS) $@ $(OBJ_B)
+# Archive the objects one group at a time: a single $(AR) call with every
+# object overflows the msys ar argument limit (see the note at the top).
+$(LAPACKELIB): $(OBJ) $(OBJ_S) $(OBJ_C) $(OBJ_D) $(OBJ_Z) $(DEPRECATED) $(EXTENDED) $(MATGEN)
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	$(AR) $(ARFLAGS) $@ $(OBJ_S)
+	$(AR) $(ARFLAGS) $@ $(OBJ_C)
+	$(AR) $(ARFLAGS) $@ $(OBJ_D)
+	$(AR) $(ARFLAGS) $@ $(OBJ_Z)
 ifdef BUILD_DEPRECATED
-	$(ARCH) $(ARCHFLAGS) $@ $(DEPRECATED)
+	$(AR) $(ARFLAGS) $@ $(DEPRECATED)
 endif
-ifdef (USEXBLAS)
-	$(ARCH) $(ARCHFLAGS) $@ $(EXTENDED)
+# ifdef takes a bare variable name; "(USEXBLAS)" named a variable that is
+# never set, so the $(EXTENDED) objects were built but never archived.
+ifdef USEXBLAS
+	$(AR) $(ARFLAGS) $@ $(EXTENDED)
 endif
 ifdef LAPACKE_WITH_TMG
-	$(ARCH) $(ARCHFLAGS) $@ $(MATGEN)
+	$(AR) $(ARFLAGS) $@ $(MATGEN)
 endif
 	$(RANLIB) $@

-clean: cleanobj
+.PHONY: clean cleanobj cleanlib
+clean: cleanobj cleanlib
 cleanobj:
 	rm -f *.o
-
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
+cleanlib:
+	rm -f $(LAPACKELIB)
--- a/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c
@ -124,7 +124,6 @@ lapack_int LAPACKE_cgejsv( int matrix_layout, char joba, char jobu, char jobv,
    float* rwork = NULL;
    lapack_complex_float* cwork = NULL;
    lapack_int i;
-    lapack_int nu, nv;
    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
        LAPACKE_xerbla( "LAPACKE_cgejsv", -1 );
        return -1;
@ -132,8 +131,6 @@ lapack_int LAPACKE_cgejsv( int matrix_layout, char joba, char jobu, char jobv,
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m;
-        nv = LAPACKE_lsame( jobv, 'n' ) ? 1 : n;
        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
            return -10;
        }
--- a/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c
@ -75,7 +75,7 @@ lapack_int LAPACKE_cgelsd( int matrix_layout, lapack_int m, lapack_int n,
    if( info != 0 ) {
        goto exit_level_0;
    }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
    lrwork = (lapack_int)rwork_query;
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
--- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c
@ -0,0 +1,122 @
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgesvdq
+* Author: Intel Corporation
+* Generated November 2018
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/*
+ * High-level interface to cgesvdq. Asks LAPACKE_cgesvdq_work for the
+ * integer, complex and real workspace sizes, allocates the three
+ * buffers, runs the computation and frees the buffers again.
+ *
+ * Returns the info value of LAPACKE_cgesvdq_work, -1 for an invalid
+ * matrix_layout, -6 when the NaN check of a fails, or
+ * LAPACK_WORK_MEMORY_ERROR when a workspace allocation fails.
+ */
+lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv, lapack_int* numrank)
+{
+    lapack_int info = 0;
+    lapack_int liwork = -1;
+    lapack_int* iwork = NULL;
+    lapack_int iwork_query;
+    lapack_int lcwork = -1;
+    lapack_complex_float* cwork = NULL;
+    lapack_complex_float cwork_query;
+    lapack_int lrwork = -1;
+    /* NOTE(review): the real workspace is double here although the other
+     * real argument of this single-precision routine (s) is float; confirm
+     * against the rwork parameter type of LAPACKE_cgesvdq_work. */
+    double* rwork = NULL;
+    double rwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        /* NOTE(review): a is the 9th parameter of this function, so -6 may
+         * not be the intended argument index; kept as is, confirm. */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -6;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size (every l*work argument is -1) */
+    info = LAPACKE_cgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 &iwork_query, liwork, &cwork_query, lcwork,
+                                 &rwork_query, lrwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    liwork = iwork_query;
+    lcwork = LAPACK_C2INT(cwork_query);
+    lrwork = (lapack_int)rwork_query;
+    /* Allocate memory for work arrays */
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    cwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lcwork );
+    if( cwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork );
+    if( rwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_2;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 iwork, liwork, cwork, lcwork, rwork, lrwork );
+
+    /* Release memory and exit; each label frees only the buffers that were
+     * allocated before the jump that reaches it. */
+    LAPACKE_free( rwork );
+exit_level_2:
+    LAPACKE_free( cwork );
+exit_level_1:
+    LAPACKE_free( iwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgesvdq", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c
@ -0,0 +1,149 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function cgesvdq
+* Author: Intel Corporation
+* Generated November 2015
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/*
+ * LAPACKE_cgesvdq_work - middle-level C interface to LAPACK_cgesvdq.
+ *
+ * The caller supplies the work arrays (iwork/liwork, cwork/lcwork,
+ * rwork/lrwork); this routine only deals with the matrix layout:
+ *
+ *   - LAPACK_COL_MAJOR: all arguments are forwarded to LAPACK_cgesvdq as is.
+ *   - LAPACK_ROW_MAJOR: a is transposed into a temporary (a_t),
+ *     LAPACK_cgesvdq runs on the temporaries, and a, u and v are transposed
+ *     back from them. Temporaries for u and v are allocated only when
+ *     jobu / jobv is 'a' or 's'. With lcwork == -1 the call is forwarded as
+ *     a workspace query and no temporaries are allocated.
+ *
+ * Returns the info value set by LAPACK_cgesvdq, decremented by one when it
+ * is negative; -9, -12 or -14 when lda, ldu or ldv fails the row-major
+ * leading-dimension check; LAPACK_TRANSPOSE_MEMORY_ERROR when a temporary
+ * cannot be allocated; -1 for an unknown matrix_layout. Every failure
+ * detected in this function is also reported through LAPACKE_xerbla.
+ */
+lapack_int LAPACKE_cgesvdq_work( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv, lapack_int* numrank,
+                           lapack_int* iwork, lapack_int liwork,
+                           lapack_complex_float* cwork, lapack_int lcwork,
+                           float* rwork, lapack_int lrwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv,
+                       numrank, iwork, &liwork, cwork, &lcwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        /* Dimensions of the column-major temporaries for u and v */
+        lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) ||
+                             LAPACKE_lsame( jobu, 's' ) ) ? m : 1;
+        lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m :
+                             (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
+        lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n :
+                              ( LAPACKE_lsame( jobv, 's' ) ? MIN(m,n) : 1);
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldu_t = MAX(1,nrows_u);
+        lapack_int ldv_t = MAX(1,nrows_v);
+        lapack_complex_float* a_t = NULL;
+        lapack_complex_float* u_t = NULL;
+        lapack_complex_float* v_t = NULL;
+        /* Check leading dimension(s) */
+        /* NOTE(review): -9/-12/-14 are the 1-based positions of a/u/v in this
+         * signature, not of lda/ldu/ldv (10/13/15) - confirm this is intended. */
+        if( lda < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+            return info;
+        }
+        if( ldu < ncols_u ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lcwork == -1 ) {
+            LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             cwork, &lcwork, rwork, &lrwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            u_t = (lapack_complex_float*)
+                LAPACKE_malloc( sizeof(lapack_complex_float) * ldu_t * MAX(1,ncols_u) );
+            if( u_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            v_t = (lapack_complex_float*)
+                LAPACKE_malloc( sizeof(lapack_complex_float) * ldv_t * MAX(1,n) );
+            if( v_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_2;
+            }
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info.
+         * The temporaries (a_t, u_t, v_t) must be passed here: lda_t, ldu_t
+         * and ldv_t describe them, and they are what gets transposed back
+         * into a, u and v below.
+         * NOTE(review): u_t / v_t stay NULL for jobu / jobv values other than
+         * 'a' and 's' - confirm LAPACK_cgesvdq never writes U / V in those
+         * modes. */
+        LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a_t, &lda_t,
+                        s, u_t, &ldu_t, v_t, &ldv_t, numrank, iwork, &liwork,
+                        cwork, &lcwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t,
+                               u, ldu );
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v,
+                               ldv );
+        }
+        /* Release memory and exit; the labels unwind the allocations in
+         * reverse order so a failed allocation frees only what exists */
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_free( v_t );
+        }
+exit_level_2:
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_free( u_t );
+        }
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+        }
+    } else {
+        /* Unknown matrix_layout value */
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+    }
+    return info;
+}
--- a/lapack-netlib/LAPACKE/src/lapacke_cggesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cggesx.c
@ -91,7 +91,7 @@ lapack_int LAPACKE_cggesx( int matrix_layout, char jobvsl, char jobvsr,
    if( info != 0 ) {
        goto exit_level_2;
    }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
--- a/lapack-netlib/LAPACKE/src/lapacke_chbevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chbevd.c
@ -67,7 +67,7 @@ lapack_int LAPACKE_chbevd( int matrix_layout, char jobz, char uplo, lapack_int n
    if( info != 0 ) {
        goto exit_level_0;
    }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
    lrwork = (lapack_int)rwork_query;
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
--- a/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c
@ -67,7 +67,7 @@ lapack_int LAPACKE_chbevd_2stage( int matrix_layout, char jobz, char uplo, lapac
    if( info != 0 ) {
        goto exit_level_0;
    }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
    lrwork = (lapack_int)rwork_query;
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
--- a/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c
@ -71,7 +71,7 @@ lapack_int LAPACKE_chbgvd( int matrix_layout, char jobz, char uplo, lapack_int n
    if( info != 0 ) {
        goto exit_level_0;
    }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
    lrwork = (lapack_int)rwork_query;
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
--- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
@ -70,7 +70,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo,
            goto exit_level_0;
        }
        /* Transpose input matrices */
-        LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
        /* Call LAPACK function and adjust info */
        LAPACK_cheev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                      &info );
@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
        /* Release memory and exit */
        LAPACKE_free( a_t );
 exit_level_0:
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd.c
@ -53,7 +53,7 @@ lapack_int LAPACKE_cheevd( int matrix_layout, char jobz, char uplo, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -5;
        }
    }
@ -65,7 +65,7 @@ lapack_int LAPACKE_cheevd( int matrix_layout, char jobz, char uplo, lapack_int n
    if( info != 0 ) {
        goto exit_level_0;
    }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
    lrwork = (lapack_int)rwork_query;
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c
@ -53,7 +53,7 @@ lapack_int LAPACKE_cheevd_2stage( int matrix_layout, char jobz, char uplo, lapac
 #ifndef LAPACK_DISABLE_NAN_CHECK
    if( LAPACKE_get_nancheck() ) {
        /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
            return -5;
        }
    }
@ -65,7 +65,7 @@ lapack_int LAPACKE_cheevd_2stage( int matrix_layout, char jobz, char uplo, lapac
    if( info != 0 ) {
        goto exit_level_0;
    }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
    lrwork = (lapack_int)rwork_query;
    lwork = LAPACK_C2INT( work_query );
    /* Allocate memory for work arrays */
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
@ -71,7 +71,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo,
            goto exit_level_0;
        }
        /* Transpose input matrices */
-        LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
        /* Call LAPACK function and adjust info */
        LAPACK_cheevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                       &lrwork, iwork, &liwork, &info );
@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
        /* Release memory and exit */
        LAPACKE_free( a_t );
 exit_level_0:
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
@ -71,7 +71,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo,
            goto exit_level_0;
        }
        /* Transpose input matrices */
-        LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
        /* Call LAPACK function and adjust info */
        LAPACK_cheevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                       &lrwork, iwork, &liwork, &info );
@ -79,7 +79,8 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo,
            info = info - 1;
        }
        /* Transpose output matrices */
-        LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+
        /* Release memory and exit */
        LAPACKE_free( a_t );
 exit_level_0:
--- a/Show More
+++ b/Show More