From ceb44bef1485bb664cee97113f8486d74ac59443 Mon Sep 17 00:00:00 2001 From: damonyu Date: Tue, 27 Apr 2021 11:12:29 +0800 Subject: [PATCH 001/108] update the intrinsic api to the official name. --- kernel/riscv64/amax_vector.c | 97 +++++++++++-------- kernel/riscv64/amin_vector.c | 98 +++++++++++-------- kernel/riscv64/asum_vector.c | 88 +++++++++-------- kernel/riscv64/axpby_vector.c | 86 ++++++++--------- kernel/riscv64/axpy_vector.c | 46 +++++---- kernel/riscv64/copy_vector.c | 38 ++++---- kernel/riscv64/dot_vector.c | 97 ++++++++++--------- kernel/riscv64/gemv_n_vector.c | 38 ++++---- kernel/riscv64/gemv_t_vector.c | 84 ++++++++-------- kernel/riscv64/iamax_vector.c | 133 +++++++++++++------------ kernel/riscv64/iamin_vector.c | 133 +++++++++++++------------ kernel/riscv64/imax_vector.c | 117 +++++++++++----------- kernel/riscv64/imin_vector.c | 117 +++++++++++----------- kernel/riscv64/izamax_vector.c | 112 +++++++++++---------- kernel/riscv64/izamin_vector.c | 114 ++++++++++++---------- kernel/riscv64/max_vector.c | 72 +++++++------- kernel/riscv64/min_vector.c | 72 +++++++------- kernel/riscv64/nrm2_vector.c | 135 ++++++++++++++------------ kernel/riscv64/nrm2_vector_dot.c | 75 +++++++------- kernel/riscv64/rot_vector.c | 56 +++++------ kernel/riscv64/scal_vector.c | 48 ++++----- kernel/riscv64/swap_vector.c | 44 ++++----- kernel/riscv64/symv_L_vector.c | 112 ++++++++++----------- kernel/riscv64/symv_U_vector.c | 116 +++++++++++----------- kernel/riscv64/zamax_vector.c | 76 ++++++++------- kernel/riscv64/zamin_vector.c | 77 ++++++++------- kernel/riscv64/zasum_vector.c | 90 +++++++++-------- kernel/riscv64/zaxpby_vector.c | 54 +++++------ kernel/riscv64/zaxpy_vector.c | 30 +++--- kernel/riscv64/zcopy_vector.c | 22 ++--- kernel/riscv64/zdot_vector.c | 78 ++++++++------- kernel/riscv64/zgemv_n_vector.c | 38 ++++---- kernel/riscv64/zgemv_t_vector.c | 69 +++++++------ kernel/riscv64/zhemv_LM_vector.c | 79 ++++++++------- kernel/riscv64/zhemv_UV_vector.c | 79 
++++++++------- kernel/riscv64/znrm2_vector.c | 161 ++++++++++++++++--------------- kernel/riscv64/zrot_vector.c | 46 ++++----- kernel/riscv64/zscal_vector.c | 44 ++++----- kernel/riscv64/zswap_vector.c | 36 +++---- 39 files changed, 1628 insertions(+), 1479 deletions(-) diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b6aec131e..5312f9ef0 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t 
+#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_zero; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_zero = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; @@ -162,6 +175,7 @@ asm volatile( //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -170,6 +184,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -185,6 +200,7 @@ asm volatile( //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -193,6 +209,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, 
%0, %2, v0.t \n\t" @@ -205,17 +222,17 @@ asm volatile( j += gvl*2; ix += inc_xv*2; } - v0 = VFMVVF_FLOAT(0, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 53243ad56..ae2867ef8 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT 
vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,11 +66,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); MASK_T mask0, mask1; - FLOAT zero = 0.0; + FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 
-#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,39 +65,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, 
gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int stride_y = inc_y * sizeof(FLOAT); @@ -150,20 +156,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = 
VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } } return(dot); diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index bd4d23eae..32ca8618b 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,23 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -57,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va0, va1, vy0, vy1; unsigned int gvl = 0; if(inc_y == 1){ - gvl = vsetvli(m, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m); if(gvl <= m/2){ for(k=0,j=0; k maxf){ //tail index 
v_max_index = VIDV_UINT(gvl); @@ -135,7 +142,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -145,35 +152,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 608f19a00..5bcffece5 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -32,49 +32,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 
-#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -89,42 +93,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, 
gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -136,7 +143,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -146,35 +153,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += 
inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index 44af7101b..42705f5de 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT 
vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,8 +89,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ @@ -94,27 +103,25 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); 
max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -126,7 +133,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -137,28 +144,26 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index e6e0e9f9f..3afa74dd6 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -32,45 
+32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T 
uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,15 +89,20 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -113,26 +122,24 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; 
+ v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -143,7 +150,7 @@ asm volatile( } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -154,7 +161,7 @@ asm volatile( //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -175,27 +182,25 @@ asm volatile( #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c 
index 62c95d973..ddb5eabde 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -30,47 +30,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define 
VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif #define RVV_M RVV_M8 @@ -86,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-1, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -96,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = 
VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -119,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -143,7 +154,7 @@ asm volatile( //index where element greater than v_max mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); + v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -163,7 +174,7 @@ asm volatile( :"v0"); #endif */ - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); @@ -171,19 +182,19 @@ asm volatile( ix += inc_xv; } vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask0,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -206,7 +217,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -227,9 +238,8 @@ asm volatile( #endif */ v_max = VFADDVV_FLOAT(vx0, 
vx1, gvl); - vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - FLOAT cur_maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 38eccf1b5..6e328dc31 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,50 +31,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define 
VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -87,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + 
FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + + gvl = VSETVL(n); v_min_index = VMVVX_UINT(0, gvl); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -97,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -120,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -144,7 +154,7 @@ asm volatile( //index where element less than v_min mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); + v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -164,27 +174,26 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, gvl); j += gvl; ix += inc_xv; } - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask0,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); 
/* #if defined(DOUBLE) asm volatile( @@ -207,7 +216,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -228,9 +237,8 @@ asm volatile( #endif */ v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - FLOAT cur_minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 4ef75452d..0fc59b74c 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T 
vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 83c965bfa..8223fa87a 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 
-#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ va = VLEV_FLOAT(&a_ptr[i], gvl); @@ -89,11 +97,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -101,9 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, 
va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -121,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ @@ -136,11 +142,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -148,9 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); inc_xv = inc_x * gvl; for(k = 0; k < len / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, 
BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -220,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -237,11 +239,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -249,9 +250,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 29e0e4b65..7229a48b1 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,33 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 +#define VFMULVV_FLOAT vfmul_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT 
vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 +#define VFMULVV_FLOAT vfmul_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -65,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT temp2; FLOAT *a_ptr = a; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; @@ -78,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA temp2 = 0.0; if(j > 0){ i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ vy = VLEV_FLOAT(&y[i], gvl); @@ -91,11 +99,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -103,9 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -122,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -137,11 +143,10 @@ int 
CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -149,9 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ ix = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -219,7 +221,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix 
= 0; iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -236,11 +238,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -248,9 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index a6c742b14..5cd65b225 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || 
inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max = VFMVVF_FLOAT(0, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i maxf) - maxf = v_max[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; } return(maxf); } diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index 44a7cf1dc..9d567b3da 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define 
VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -63,18 +67,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define
VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,40 +65,44 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n2/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -134,13 +141,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = 
VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -173,11 +179,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 6fe12c76c..40cd9cd64 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT 
vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B FLOAT temp_r2, temp_i2; FLOAT *a_ptr = a; unsigned int gvl = 0; - + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; @@ -89,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B ia = 0; i = 0; if(j > 0){ - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -133,13 +140,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -172,11 +178,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += temp_r1 * a_ptr[ja]; diff 
--git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index b0ebfa5f4..5ac62eb80 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 #define ABS fabsf -#define MASK_T e32xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 -#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 -#define VMFIRSTM vmfirstm_e32xm4 -#define VFDIVVF_FLOAT vfdivvf_float32xm4 -#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#define MASK_T vbool8_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 +#define VMFIRSTM vmfirst_m_b8 +#define VFDIVVF_FLOAT vfdiv_vf_f32m4 +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define VSETVL(n) 
vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 #define ABS fabs -#define MASK_T e64xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 -#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 -#define VMFIRSTM vmfirstm_e64xm4 -#define VFDIVVF_FLOAT vfdivvf_float64xm4 -#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#define MASK_T vbool16_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 +#define VMFIRSTM vmfirst_m_b16 +#define VFDIVVF_FLOAT vfdiv_vf_f64m4 +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -73,19 +77,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T vr, v0, v_zero; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT scale = 0.0, ssq = 0.0; MASK_T mask; BLASLONG index = 0; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); vr = VFMVVF_FLOAT(0, gvl); v_zero = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 
#else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i Date: Wed, 28 Apr 2021 13:56:06 +0000 Subject: [PATCH 002/108] GEMM: skylake: improve the performance when m is small --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 79 ++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 9f2bf24e2..15185d7fc 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -149,6 +149,7 @@ #define KERNEL_h_k1m16n2 \ "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ unit_acc_m16n2(8,9,10,11,%1) + #endif #define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) @@ -283,7 +284,32 @@ #define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;" #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m4n12 
KERNEL_h_k1m4n12 "addq $16,%%r15;" +//#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +#define unit_acc_k2m4n2(c1_no,c2_no,...)\ + "vbroadcastf64x4 ("#__VA_ARGS__"),%%zmm3; vpermpd %%zmm3,%%zmm30,%%zmm3;"\ + "vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + +#define unit_merge_to_ymm(c1_no) \ + "vextractf64x4 $1,%%zmm"#c1_no",%%ymm30; vaddpd %%ymm"#c1_no",%%ymm30,%%ymm"#c1_no";" + +#define KERNEL_k1m4n12 \ + "cmpq $2, %5; jb 104912f;"\ + "vmovupd 64+%11,%%zmm30;"\ + "\n204912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k2m4n2(4,5,%1) unit_acc_k2m4n2(6,7,%1,%%r12,1) unit_acc_k2m4n2(8, 9, %1, %%r12, 2) "addq $32,%1;" \ + unit_acc_k2m4n2(10,11,%%r15) unit_acc_k2m4n2(12,13,%%r15,%%r12,1) unit_acc_k2m4n2(14,15,%%r15,%%r12,2) "addq $32,%%r15;" \ + "subq $2, %5; cmpq $2, %5; jnb 204912b;"\ + unit_merge_to_ymm(4) unit_merge_to_ymm(5) unit_merge_to_ymm(6) unit_merge_to_ymm(7) \ + unit_merge_to_ymm(8) unit_merge_to_ymm(9) unit_merge_to_ymm(10) unit_merge_to_ymm(11) \ + unit_merge_to_ymm(12) unit_merge_to_ymm(13) unit_merge_to_ymm(14) unit_merge_to_ymm(15) \ + "testq %5, %5; jz 1004912f;"\ + "\n104912:"\ + KERNEL_h_k1m4n12 "addq $16,%%r15;"\ + "decq %5; jnz 104912b;"\ + "\n1004912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) @@ -336,7 +362,31 @@ #define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;" #define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" +//#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" + +#define unit_acc_k4m2n2(c1_no,c2_no,...) 
\ + "vmovupd ("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + +#define unit_merge_to_xmm(c1_no) \ + "vextractf64x2 $0,%%zmm"#c1_no",%%xmm20; vextractf64x2 $1,%%zmm"#c1_no",%%xmm21; vextractf64x2 $2,%%zmm"#c1_no",%%xmm22; vextractf64x2 $3,%%zmm"#c1_no",%%xmm23;"\ + "vaddpd %%xmm20,%%xmm21,%%xmm20; vaddpd %%xmm22,%%xmm23,%%xmm22; vaddpd %%xmm20,%%xmm22,%%xmm"#c1_no";" + +#define KERNEL_k1m2n12 \ + "cmpq $4,%5; jb 102912f;"\ + "\n402912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k4m2n2(4,5,%1) unit_acc_k4m2n2(6,7,%1,%%r12,1) unit_acc_k4m2n2(8,9,%1,%%r12,2) "addq $64,%1;" \ + unit_acc_k4m2n2(10,11,%%r15) unit_acc_k4m2n2(12,13,%%r15,%%r12,1) unit_acc_k4m2n2(14,15,%%r15,%%r12,2) "addq $64,%%r15;" \ + "subq $4,%5; cmpq $4,%5; jnb 402912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + unit_merge_to_xmm(10) unit_merge_to_xmm(11) unit_merge_to_xmm(12) unit_merge_to_xmm(13) unit_merge_to_xmm(14) unit_merge_to_xmm(15) \ + "testq %5,%5; jz 1002912f;"\ + "\n102912:"\ + KERNEL_h_k1m2n12 "addq $16,%%r15;" \ + "decq %5; jnz 102912b;" \ + "\n1002912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) @@ -387,7 +437,24 @@ #define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;" #define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;" #define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;" -#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +//#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +#define KERNEL_k1m1n12 \ + "cmpq $4,%5; jb 101912f;" \ + "vmovupd %11,%%zmm2;"\ + "\n401912:"\ + "vmovupd (%0),%%ymm1; vpermpd 
%%zmm1,%%zmm2,%%zmm1; addq $32,%0;" \ + "vfmadd231pd (%1),%%zmm1,%%zmm4; vfmadd231pd (%1,%%r12,1),%%zmm1,%%zmm5; vfmadd231pd (%1,%%r12,2),%%zmm1,%%zmm6; addq $64,%1;"\ + "vfmadd231pd (%%r15),%%zmm1,%%zmm7; vfmadd231pd (%%r15,%%r12,1),%%zmm1,%%zmm8; vfmadd231pd (%%r15,%%r12,2),%%zmm1,%%zmm9; addq $64,%%r15;"\ + "subq $4,%5; cmpq $4,%5; jnb 401912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) \ + unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + "testq %5,%5; jz 1001912f;"\ + "\n101912:"\ + KERNEL_h_k1m1n12 "addq $16,%%r15;" \ + "decq %5; jnz 101912b;" \ + "\n1001912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" @@ -480,7 +547,7 @@ COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\ #ndim"33106:\n\t"\ "movq %%r14,%1;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K):"r10","r11","r12","r13","r14","r15","cc","memory",\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K), "o"(permute_table):"r10","r11","r12","r13","r14","r15","cc","memory",\ "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\ @@ -501,6 +568,10 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; BLASLONG n_count = n, off = 0; double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B; + int64_t permute_table[] = { + 0, 0, 1, 1, 2, 2, 3, 3, // abcdxxxx -> aabbccdd + 0, 1, 0, 1, 
2, 3, 2, 3, // abcdxxxx -> ababcdcd + }; #ifdef TRMMKERNEL #ifdef LEFT off = offset; From c59652f0ce88ea7bba97704f332c3ec77bd528c9 Mon Sep 17 00:00:00 2001 From: pnp Date: Fri, 30 Apr 2021 12:14:58 -0400 Subject: [PATCH 003/108] optimize on sgemv_n for small n --- kernel/x86_64/sgemv_n_4.c | 56 ++++- kernel/x86_64/sgemv_n_microk_skylakex-8.c | 258 ++++++++++++++++++++++ 2 files changed, 304 insertions(+), 10 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_skylakex-8.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 3eec21774..81d495eae 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,8 +35,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_n_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#include "sgemv_n_microk_haswell-4.c" +#include "sgemv_n_microk_skylakex-8.c" +#endif + #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -291,6 +296,41 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + if ( m < 1 || n < 1) return(0); + + #ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL + if (m <= 16384 && n <= 48 && !(n == 4)) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + FLOAT * xbuffer = NULL; + FLOAT * ybuffer = NULL; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_N_SKYLAKE_KERNEL 1 +#include "common.h" +#include +static int sgemv_kernel_n_128(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, 
float *x, float *y) +{ + __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7; + __m512 xArray_0; + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); + BLASLONG tag_m_128x = m & (~127); + BLASLONG tag_m_64x = m & (~63); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + matrixArray_4 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 64]); + matrixArray_5 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 80]); + matrixArray_6 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 96]); + matrixArray_7 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 112]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + accum512_4 = _mm512_fmadd_ps(matrixArray_4, xArray_0, accum512_4); + accum512_5 = _mm512_fmadd_ps(matrixArray_5, xArray_0, accum512_5); + accum512_6 = _mm512_fmadd_ps(matrixArray_6, xArray_0, accum512_6); + accum512_7 = _mm512_fmadd_ps(matrixArray_7, xArray_0, accum512_7); + } + + 
_mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(accum512_4, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(accum512_5, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(accum512_6, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(accum512_7, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + } + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + 
_mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + } + + if(tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + } + + if(tag_m_32x != m) { + + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + + if (tag_m_16x != m) { + accum512_0 = _mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_16x]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, 
_mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + } + } + } + return 0; +} + +static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y) +{ + __m256 ma0, ma1, ma2, ma3, ma4, ma5, ma6, ma7; + __m256 as0, as1, as2, as3, as4, as5, as6, as7; + __m256 alphav = _mm256_set1_ps(alpha); + __m256 xv; + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __mmask8 one_mask = 0xff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + as0 = _mm256_setzero_ps(); + as1 = _mm256_setzero_ps(); + as2 = _mm256_setzero_ps(); + as3 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma0 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma1 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + ma2 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +16]); + ma3 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +24]); + + as0 = _mm256_maskz_fmadd_ps(one_mask, ma0, xv, as0); + as1 = _mm256_maskz_fmadd_ps(one_mask, ma1, xv, as1); + as2 = _mm256_maskz_fmadd_ps(one_mask, ma2, xv, as2); + as3 = _mm256_maskz_fmadd_ps(one_mask, ma3, xv, as3); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as0, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as1, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + _mm256_mask_storeu_ps(&y[idx_m + 16], one_mask, _mm256_maskz_fmadd_ps(one_mask, as2, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 16]))); + _mm256_mask_storeu_ps(&y[idx_m + 24], one_mask, _mm256_maskz_fmadd_ps(one_mask, as3, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 24]))); + + } + + if (tag_m_32x != m ) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + as4 = _mm256_setzero_ps(); + as5 = _mm256_setzero_ps(); + + for (BLASLONG idx_n 
= 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma4 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma5 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + + as4 = _mm256_maskz_fmadd_ps(one_mask, ma4, xv, as4); + as5 = _mm256_maskz_fmadd_ps(one_mask, ma5, xv, as5); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as4, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as5, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + } + + if (tag_m_16x != m ) { + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + as6 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma6 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m]); + as6 = _mm256_maskz_fmadd_ps(one_mask, ma6, xv, as6); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as6, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + } + + if (tag_m_8x != m) { + as7 = _mm256_setzero_ps(); + + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma7 = _mm256_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_8x]); + + as7 = _mm256_maskz_fmadd_ps(tail_mask, ma7, xv, as7); + } + + _mm256_mask_storeu_ps(&y[tag_m_8x], tail_mask, _mm256_maskz_fmadd_ps(tail_mask, as7, alphav, _mm256_maskz_loadu_ps(tail_mask, &y[tag_m_8x]))); + + } + } + } + + return 0; +} + + +#endif \ No newline at end of file From 3d4ccd2a130447eb7e0b8f5326dcd6e856fb8de9 Mon Sep 17 00:00:00 2001 From: pnp Date: Fri, 30 Apr 2021 12:25:33 -0400 Subject: [PATCH 004/108] fix for build error --- kernel/x86_64/sgemv_n_4.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 
81d495eae..bc006bf3c 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -42,8 +42,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_skylakex-8.c" #endif -#endif - #if defined(STEAMROLLER) || defined(EXCAVATOR) #define NBMAX 2048 #else From 380f955078eee43d729453f011388ce51e5dc675 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 May 2021 00:00:29 +0200 Subject: [PATCH 005/108] Update version to 0.3.15.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 15f6ba2c2..0863163c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 15) +set(OpenBLAS_PATCH_VERSION 15.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 9721b57ecfd194f1a4aaa08d715735cd9e8ad8b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 May 2021 00:01:08 +0200 Subject: [PATCH 006/108] Update version to 0.3.15.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0c138331e..64c8ff778 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.15 +VERSION = 0.3.15.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 8b599836db17451bf28e3ad74b0e26474af0c1b4 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 4 May 2021 13:55:02 -0500 Subject: [PATCH 007/108] Add error message token for SBGEMM in gemm.c --- interface/gemm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/interface/gemm.c b/interface/gemm.c index 6fde69049..cd5d00589 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -49,6 +49,8 @@ #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMM " #else #define ERROR_NAME "SGEMM " #endif From 206e03fdaca9f412e8a48963816f3a46e13d45b2 Mon Sep 17 00:00:00 2001 From: drhpc Date: Tue, 4 May 2021 21:02:07 +0200 Subject: [PATCH 008/108] Delete lapack_wrappers.c.orig This looks like a leftover from patching and confuses further patching;-) --- relapack/src/lapack_wrappers.c.orig | 607 ---------------------------- 1 file changed, 607 deletions(-) delete mode 100644 relapack/src/lapack_wrappers.c.orig diff --git a/relapack/src/lapack_wrappers.c.orig b/relapack/src/lapack_wrappers.c.orig deleted file mode 100644 index d89d2fe2f..000000000 --- a/relapack/src/lapack_wrappers.c.orig +++ /dev/null @@ -1,607 +0,0 @@ -#include "relapack.h" - -//////////// -// XLAUUM // -//////////// - -#if INCLUDE_SLAUUM -void LAPACK(slauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_slauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DLAUUM -void LAPACK(dlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dlauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CLAUUM -void LAPACK(clauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_clauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZLAUUM -void LAPACK(zlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_zlauum(uplo, n, A, ldA, 
info); -} -#endif - - -//////////// -// XSYGST // -//////////// - -#if INCLUDE_SSYGST -void LAPACK(ssygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info -) { - RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_DSYGST -void LAPACK(dsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info -) { - RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_CSYGST -void LAPACK(csygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info -) { - RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_ZSYGST -void LAPACK(zsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info -) { - RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - - -//////////// -// XTRTRI // -//////////// - -#if INCLUDE_STRTRI -void LAPACK(strtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_strtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_DTRTRI -void LAPACK(dtrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_CTRTRI -void LAPACK(ctrtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZTRTRI -void LAPACK(ztrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); -} -#endif - - -//////////// -// XPOTRF // -//////////// - -#if INCLUDE_SPOTRF -void LAPACK(spotrf)( - const char *uplo, const int 
*n, - float *A, const int *ldA, - int *info -) { - RELAPACK_spotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DPOTRF -void LAPACK(dpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dpotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CPOTRF -void LAPACK(cpotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_cpotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZPOTRF -void LAPACK(zpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_zpotrf(uplo, n, A, ldA, info); -} -#endif - - -//////////// -// XPBTRF // -//////////// - -#if INCLUDE_SPBTRF -void LAPACK(spbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info -) { - RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_DPBTRF -void LAPACK(dpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info -) { - RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_CPBTRF -void LAPACK(cpbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info -) { - RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_ZPBTRF -void LAPACK(zpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info -) { - RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - - -//////////// -// XSYTRF // -//////////// - -#if INCLUDE_SSYTRF -void LAPACK(ssytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_DSYTRF -void LAPACK(dsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - 
-#if INCLUDE_CSYTRF -void LAPACK(csytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZSYTRF -void LAPACK(zsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CHETRF -void LAPACK(chetrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZHETRF -void LAPACK(zhetrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_SSYTRF_ROOK -void LAPACK(ssytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_DSYTRF_ROOK -void LAPACK(dsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CSYTRF_ROOK -void LAPACK(csytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZSYTRF_ROOK -void LAPACK(zsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CHETRF_ROOK -void 
LAPACK(chetrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZHETRF_ROOK -void LAPACK(zhetrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - - -//////////// -// XGETRF // -//////////// - -#if INCLUDE_SGETRF -void LAPACK(sgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_DGETRF -void LAPACK(dgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_CGETRF -void LAPACK(cgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_ZGETRF -void LAPACK(zgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); -} -#endif - - -//////////// -// XGBTRF // -//////////// - -#if INCLUDE_SGBTRF -void LAPACK(sgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_DGBTRF -void LAPACK(dgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_CGBTRF -void LAPACK(cgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - 
-#if INCLUDE_ZGBTRF -void LAPACK(zgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - - -//////////// -// XTRSYL // -//////////// - -#if INCLUDE_STRSYL -void LAPACK(strsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info -) { - RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_DTRSYL -void LAPACK(dtrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info -) { - RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_CTRSYL -void LAPACK(ctrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info -) { - RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_ZTRSYL -void LAPACK(ztrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info -) { - RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - - -//////////// -// XTGSYL // -//////////// - -#if INCLUDE_STGSYL -void LAPACK(stgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - 
float *F, const int *ldF, - float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_DTGSYL -void LAPACK(dtgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, - double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_CTGSYL -void LAPACK(ctgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, - float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_ZTGSYL -void LAPACK(ztgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, - double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - - -//////////// -// XGEMMT // -//////////// - -#if INCLUDE_SGEMMT -void LAPACK(sgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, 
const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC -) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DGEMMT -void LAPACK(dgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC -) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CGEMMT -void LAPACK(cgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC -) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZGEMMT -void LAPACK(zgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC -) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); -} -#endif From c0ca63ea4672c3b013136ef54a69e5ab967be270 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 May 2021 14:55:36 +0200 Subject: [PATCH 009/108] Fix missing conditionals for non-SKX kernels --- kernel/x86_64/sgemv_n_4.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index bc006bf3c..06de28d97 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -417,7 +417,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 2 ) { +#ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL sgemv_kernel_n_64(NB, 2, alpha, a_ptr, lda, x_ptr, ybuffer); +#else + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); +#endif a_ptr += lda*2; x_ptr += 2; } @@ -425,7 +429,11 @@ int CNAME(BLASLONG m, BLASLONG n, 
BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { +#ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL sgemv_kernel_n_64(NB, 1, alpha, a_ptr, lda, x_ptr, ybuffer); +#else + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); +#endif /* a_ptr += lda; x_ptr += 1a; */ From bda8820da73193d4115016c571f7898d53047f7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 May 2021 20:20:08 +0200 Subject: [PATCH 010/108] Use percent instead of ampersand as placeholder for substitutions --- f_check | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/f_check b/f_check index 2c0d7fcb9..4825fb09a 100644 --- a/f_check +++ b/f_check @@ -314,11 +314,11 @@ if ($link ne "") { $link =~ s/\-Y\sP\,/\-Y/g; - $link =~ s/\-R\s*/\-rpath\@/g; + $link =~ s/\-R\s*/\-rpath\%/g; - $link =~ s/\-rpath\s+/\-rpath\@/g; + $link =~ s/\-rpath\s+/\-rpath\%/g; - $link =~ s/\-rpath-link\s+/\-rpath-link\@/g; + $link =~ s/\-rpath-link\s+/\-rpath-link\%/g; @flags = split(/[\s\,\n]/, $link); # remove leading and trailing quotes from each flag. @@ -344,13 +344,13 @@ if ($link ne "") { } - if ($flags =~ /^\-rpath\@/) { - $flags =~ s/\@/\,/g; + if ($flags =~ /^\-rpath\%/) { + $flags =~ s/\%/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /^\-rpath-link\@/) { - $flags =~ s/\@/\,/g; + if ($flags =~ /^\-rpath-link\%/) { + $flags =~ s/\%/\,/g; $linker_L .= "-Wl,". $flags . 
" " ; } if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { From ec7d6c02bcdbd8d0f2986136a21f79f70417efe0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 May 2021 08:02:01 +0200 Subject: [PATCH 011/108] Add an Android crossbuild on OSX to Azure CI (#3224) * Add an Android crossbuild on OSX --- azure-pipelines.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 56a3fd4ae..4b6b2b0e6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -137,3 +137,13 @@ jobs: source /opt/intel/oneapi/setvars.sh make CC=/usr/local/opt/llvm/bin/clang FC=ifort +- job: OSX_NDK_ARMV7 + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + brew install --cask android-ndk + export ANDROID_NDK_HOME=/usr/local/share/android-ndk + make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 + From bd60fb6ffc9d14834ed03bed0f7e6e44126c6c05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 May 2021 23:05:00 +0200 Subject: [PATCH 012/108] filter out -mavx flag on zgemm kernels as it can cause problems with older gcc --- kernel/Makefile.L3 | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index d8d739965..be10ee018 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -819,7 +819,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -829,7 +829,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c 
-DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -839,7 +839,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -849,7 +849,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ endif $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) @@ -1045,7 +1045,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1055,7 +1055,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ endif 
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1065,7 +1065,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1075,7 +1075,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1085,7 +1085,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1095,7 +1095,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(ZTRMMKERNEL) @@ -1105,7 +1105,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1115,7 +1115,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif else @@ -1187,28 +1187,28 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif From 8b90e5f2029f21eecbcf961164516cd69da16e98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 15:06:44 +0200 Subject: [PATCH 013/108] Drop redundant inclusion of complex.h --- kernel/x86_64/cdot.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index f2bf19dcd..654cd351a 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -27,7 +27,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#include #if defined(BULLDOZER) From 73f637e5848ae19b90f522222f03df875f21468f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 15:08:12 +0200 Subject: [PATCH 014/108] Support compilation with pre-C99 versions of MSVC --- utest/ctest.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/utest/ctest.h b/utest/ctest.h index d316b1494..037f7f28d 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -28,7 +28,10 @@ #define WEAK #endif +#ifndef __MSC_VER #include /* intmax_t, uintmax_t, PRI* */ +#endif + #include /* size_t */ typedef void (*SetupFunc)(void*); @@ -72,6 +75,13 @@ struct ctest { #define __CTEST_NO_TIME #define CTEST_NO_COLORS +#if __MSC_VER >= 1500 +#include +#else +#include +#define CTEST_NO_INTTYPES +#endif + #ifndef CTEST_ADD_TESTS_MANUALLY #pragma section(".ctest$a") #pragma section(".ctest$u") @@ -480,11 +490,19 @@ void assert_data(const unsigned char* exp, size_t expsize, const char* caller, int line) { size_t i; if (expsize != realsize) { +#ifndef CTEST_NO_INTTYPES CTEST_ERR("%s:%d expected %" PRIuMAX " bytes, got %" PRIuMAX, caller, line, (uintmax_t) expsize, (uintmax_t) realsize); +#else + CTEST_ERR("%s:%d expected %u bytes, got %u", caller, line, (uintmax_t) expsize, (uintmax_t) realsize); +#endif } for (i=0; i exp2) { +#ifndef CTEST_NO_INTTYPES CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real); +#else + CTEST_ERR("%s:%d expected %d-%d, got %d", caller, line, exp1, exp2, real); +#endif } } From eef1c42f03693da6d4f5be91865500fef6803dcf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:53:03 +0200 Subject: [PATCH 015/108] Convert ?chkaa to use dynamic allocation for the larger arrays --- lapack-netlib/TESTING/LIN/CMakeLists.txt | 8 +- lapack-netlib/TESTING/LIN/Makefile | 8 +- lapack-netlib/TESTING/LIN/cchkaa.F | 1237 +++++++++++++++++++++ lapack-netlib/TESTING/LIN/dchkaa.F | 1080 ++++++++++++++++++ 
lapack-netlib/TESTING/LIN/schkaa.F | 1074 ++++++++++++++++++ lapack-netlib/TESTING/LIN/zchkaa.F | 1271 ++++++++++++++++++++++ 6 files changed, 4670 insertions(+), 8 deletions(-) create mode 100644 lapack-netlib/TESTING/LIN/cchkaa.F create mode 100644 lapack-netlib/TESTING/LIN/dchkaa.F create mode 100644 lapack-netlib/TESTING/LIN/schkaa.F create mode 100644 lapack-netlib/TESTING/LIN/zchkaa.F diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 309ed7e77..fc55b8a96 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -6,7 +6,7 @@ set(SCLNTST slaord.f) set(DZLNTST dlaord.f) -set(SLINTST schkaa.f +set(SLINTST schkaa.F schkeq.f schkgb.f schkge.f schkgt.f schklq.f schkpb.f schkpo.f schkps.f schkpp.f schkpt.f schkq3.f schkql.f schkqr.f schkrq.f @@ -51,7 +51,7 @@ else() serrvx.f serrge.f serrsy.f serrpo.f) endif() -set(CLINTST cchkaa.f +set(CLINTST cchkaa.F cchkeq.f cchkgb.f cchkge.f cchkgt.f cchkhe.f cchkhe_rook.f cchkhe_rk.f cchkhe_aa.f cchkhe_aa_2stage.f @@ -107,7 +107,7 @@ else() cerrvx.f cerrge.f cerrhe.f cerrsy.f cerrpo.f) endif() -set(DLINTST dchkaa.f +set(DLINTST dchkaa.F dchkeq.f dchkgb.f dchkge.f dchkgt.f dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f @@ -153,7 +153,7 @@ else() derrvx.f derrge.f derrsy.f derrpo.f) endif() -set(ZLINTST zchkaa.f +set(ZLINTST zchkaa.F zchkeq.f zchkgb.f zchkge.f zchkgt.f zchkhe.f zchkhe_rook.f zchkhe_rk.f zchkhe_aa.f zchkhe_aa_2stage.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 674265816..54b26455e 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -317,13 +317,13 @@ cleanobj: cleanexe: rm -f xlintst* -schkaa.o: schkaa.f +schkaa.o: schkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkaa.o: dchkaa.f +dchkaa.o: dchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkaa.o: cchkaa.f +cchkaa.o: cchkaa.F $(FC) $(FFLAGS_DRV) -c 
-o $@ $< -zchkaa.o: zchkaa.f +zchkaa.o: zchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< .NOTPARALLEL: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F new file mode 100644 index 000000000..ec1534ed4 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -0,0 +1,1237 @@ +*> \brief \b CCHKAA +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CCHKAA +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CCHKAA is the main test program for the COMPLEX linear equation +*> routines. +*> +*> The program must be driven by a short data file. The first 15 records +*> (not including the first comment line) specify problem dimensions +*> and program options using list-directed input. The remaining lines +*> specify the LAPACK test paths and the number of matrix types to use +*> in testing. An annotated example of a data file can be obtained by +*> deleting the first 3 characters from the following 42 lines: +*> Data file for testing COMPLEX LAPACK linear equation routines +*> 7 Number of values of M +*> 0 1 2 3 5 10 16 Values of M (row dimension) +*> 7 Number of values of N +*> 0 1 2 3 5 10 16 Values of N (column dimension) +*> 1 Number of values of NRHS +*> 2 Values of NRHS (number of right hand sides) +*> 5 Number of values of NB +*> 1 3 3 3 20 Values of NB (the blocksize) +*> 1 0 5 9 1 Values of NX (crossover point) +*> 3 Number of values of RANK +*> 30 50 90 Values of rank (as a % of N) +*> 30.0 Threshold value of test ratio +*> T Put T to test the LAPACK routines +*> T Put T to test the driver routines +*> T Put T to test the error exits +*> CGE 11 List types on next line if 0 < NTYPES < 11 +*> CGB 8 List types on next line if 0 < NTYPES < 8 +*> CGT 12 List types on next line if 0 < NTYPES < 12 +*> CPO 9 List types on next line if 0 < NTYPES < 9 +*> CPS 9 List types on next line if 0 < NTYPES < 9 +*>
CPP 9 List types on next line if 0 < NTYPES < 9 +*> CPB 8 List types on next line if 0 < NTYPES < 8 +*> CPT 12 List types on next line if 0 < NTYPES < 12 +*> CHE 10 List types on next line if 0 < NTYPES < 10 +*> CHR 10 List types on next line if 0 < NTYPES < 10 +*> CHK 10 List types on next line if 0 < NTYPES < 10 +*> CHA 10 List types on next line if 0 < NTYPES < 10 +*> CH2 10 List types on next line if 0 < NTYPES < 10 +*> CSA 11 List types on next line if 0 < NTYPES < 11 +*> CS2 11 List types on next line if 0 < NTYPES < 11 +*> CHP 10 List types on next line if 0 < NTYPES < 10 +*> CSY 11 List types on next line if 0 < NTYPES < 11 +*> CSK 11 List types on next line if 0 < NTYPES < 11 +*> CSR 11 List types on next line if 0 < NTYPES < 11 +*> CSP 11 List types on next line if 0 < NTYPES < 11 +*> CTR 18 List types on next line if 0 < NTYPES < 18 +*> CTP 18 List types on next line if 0 < NTYPES < 18 +*> CTB 17 List types on next line if 0 < NTYPES < 17 +*> CQR 8 List types on next line if 0 < NTYPES < 8 +*> CRQ 8 List types on next line if 0 < NTYPES < 8 +*> CLQ 8 List types on next line if 0 < NTYPES < 8 +*> CQL 8 List types on next line if 0 < NTYPES < 8 +*> CQP 6 List types on next line if 0 < NTYPES < 6 +*> CTZ 3 List types on next line if 0 < NTYPES < 3 +*> CLS 6 List types on next line if 0 < NTYPES < 6 +*> CEQ +*> CQT +*> CQX +*> CTS +*> CHH +*> \endverbatim +* +* Parameters: +* ========== +* +*> \verbatim +*> NMAX INTEGER +*> The maximum allowable value for M and N. +*> +*> MAXIN INTEGER +*> The number of different values that can be used for each of +*> M, N, NRHS, NB, NX and RANK +*> +*> MAXRHS INTEGER +*> The maximum number of right hand sides +*> +*> MATMAX INTEGER +*> The maximum number of matrix types to use for testing +*> +*> NIN INTEGER +*> The unit number for input +*> +*> NOUT INTEGER +*> The unit number for output +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ.
of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + PROGRAM CCHKAA +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 132 ) + INTEGER MAXIN + PARAMETER ( MAXIN = 12 ) + INTEGER MAXRHS + PARAMETER ( MAXRHS = 16 ) + INTEGER MATMAX + PARAMETER ( MATMAX = 30 ) + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER KDMAX + PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) +* .. +* .. Local Scalars .. + LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*2 C2 + CHARACTER*3 PATH + CHARACTER*10 INTSTR + CHARACTER*72 ALINE + INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, + $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + REAL EPS, S1, S2, THREQ, THRESH +* .. +* .. Local Arrays .. + LOGICAL DOTYPE( MATMAX ) + INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), + $ NBVAL( MAXIN ), NBVAL2( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ RANKVAL( MAXIN ), PIV( NMAX ) + REAL S( 2*NMAX ) + COMPLEX E( NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK +* .. +* .. External Functions .. + LOGICAL LSAME, LSAMEN + REAL SECOND, SLAMCH + EXTERNAL LSAME, LSAMEN, SECOND, SLAMCH +* .. +* .. External Subroutines .. 
+ EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, + $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, + $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, + $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, + $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, + $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, + $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, + $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, + $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, + $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, + $ CCHKQRT, CCHKQRTP +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Arrays in Common .. + INTEGER IPARMS( 100 ) +* .. +* .. Common blocks .. + COMMON / CLAENV / IPARMS + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA THREQ / 2.0 / , INTSTR / '0123456789' / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + S1 = SECOND( ) + LDA = NMAX + FATAL = .FALSE. +* +* Read a dummy line. +* + READ( NIN, FMT = * ) +* +* Report values of parameters. +* + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH +* +* Read the values of M +* + READ( NIN, FMT = * )NM + IF( NM.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 + NM = 0 + FATAL = .TRUE. + ELSE IF( NM.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN + NM = 0 + FATAL = .TRUE. 
+ END IF + READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) + DO 10 I = 1, NM + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 10 CONTINUE + IF( NM.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) +* +* Read the values of N +* + READ( NIN, FMT = * )NN + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 20 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + IF( NN.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) +* +* Read the values of NRHS +* + READ( NIN, FMT = * )NNS + IF( NNS.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 + NNS = 0 + FATAL = .TRUE. + ELSE IF( NNS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN + NNS = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) + DO 30 I = 1, NNS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN + WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS + FATAL = .TRUE. + END IF + 30 CONTINUE + IF( NNS.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) +* +* Read the values of NB +* + READ( NIN, FMT = * )NNB + IF( NNB.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 + NNB = 0 + FATAL = .TRUE. + ELSE IF( NNB.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN + NNB = 0 + FATAL = .TRUE. 
+ END IF + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) + DO 40 I = 1, NNB + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + END IF + 40 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) +* +* Set NBVAL2 to be the set of unique values of NB +* + NNB2 = 0 + DO 60 I = 1, NNB + NB = NBVAL( I ) + DO 50 J = 1, NNB2 + IF( NB.EQ.NBVAL2( J ) ) + $ GO TO 60 + 50 CONTINUE + NNB2 = NNB2 + 1 + NBVAL2( NNB2 ) = NB + 60 CONTINUE +* +* Read the values of NX +* + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) + DO 70 I = 1, NNB + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + END IF + 70 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) +* +* Read the values of RANKVAL +* + READ( NIN, FMT = * )NRANK + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 + NRANK = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN + NRANK = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) + DO I = 1, NRANK + IF( RANKVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( RANKVAL( I ).GT.100 ) THEN + WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 + FATAL = .TRUE. + END IF + END DO + IF( NRANK.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', + $ ( RANKVAL( I ), I = 1, NRANK ) +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9992 )THRESH +* +* Read the flag that indicates whether to test the LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test the driver routines. +* + READ( NIN, FMT = * )TSTDRV +* +* Read the flag that indicates whether to test the error exits. 
+* + READ( NIN, FMT = * )TSTERR +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Calculate and print the machine dependent constants. +* + EPS = SLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9991 )'underflow', EPS + EPS = SLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9991 )'overflow ', EPS + EPS = SLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9991 )'precision', EPS + WRITE( NOUT, FMT = * ) + NRHS = NSVAL( 1 ) +* + 80 CONTINUE +* +* Read a test path and the number of matrix types to use. +* + READ( NIN, FMT = '(A72)', END = 140 )ALINE + PATH = ALINE( 1: 3 ) + NMATS = MATMAX + I = 3 + 90 CONTINUE + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + IF( ALINE( I: I ).EQ.' ' ) + $ GO TO 90 + NMATS = 0 + 100 CONTINUE + C1 = ALINE( I: I ) + DO 110 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 120 + END IF + 110 CONTINUE + GO TO 130 + 120 CONTINUE + NMATS = NMATS*10 + IC + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + GO TO 100 + 130 CONTINUE + C1 = PATH( 1: 1 ) + C2 = PATH( 2: 3 ) +* +* Check first character for correct precision. +* + IF( .NOT.LSAME( C1, 'Complex precision' ) ) THEN + WRITE( NOUT, FMT = 9990 )PATH +* + ELSE IF( NMATS.LE.0 ) THEN +* +* Check for a positive number of tests requested. 
+* + WRITE( NOUT, FMT = 9989 )PATH +* + ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN +* +* GE: general matrices +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN +* +* GB: general banded matrices +* + LA = ( 2*KDMAX+1 )*NMAX + LAFAC = ( 3*KDMAX+1 )*NMAX + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, + $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN +* +* GT: general tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH 
+ END IF +* + ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN +* +* PO: positive definite matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN +* +* PS: positive semi-definite matrices +* + NTYPES = 9 +* + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, + $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN +* +* PP: positive definite packed matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN +* +* PB: positive definite banded matrices +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), 
B( 1, 3 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN +* +* PT: positive definite tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HE' ) ) THEN +* +* HE: Hermitian indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HR' ) ) THEN +* +* HR: Hermitian indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 
2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HK' ) ) THEN +* +* HK: Hermitian indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than HR path version. +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN +* +* HA: Hermitian matrices, +* Aasen Algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN +* +* H2: Hermitian matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL 
CCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, + $ NNS, NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN +* +* HP: Hermitian indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN +* +* SY: symmetric indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN +* +* SR: symmetric indefinite matrices, +* with bounded Bunch-Kaufman 
(rook) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN +* +* SK: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than SR path version. +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* SA: symmetric indefinite matrices with Aasen's algorithm, +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, 
NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* S2: symmetric indefinite matrices with Aasen's algorithm +* 2 stage +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN +* +* SP: symmetric indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN +* +* TR: triangular matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN +* +* TP: triangular packed matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + 
IF( TSTCHK ) THEN + CALL CCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN +* +* TB: triangular banded matrices +* + NTYPES = 17 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN +* +* QR: QR factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN +* +* LQ: LQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN +* +* QL: QL factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN +* +* RQ: RQ factorization +* + NTYPES = 8 + 
CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN +* +* EQ: Equilibration routines for general and positive definite +* matrices (THREQ should be between 2 and 10) +* + IF( TSTCHK ) THEN + CALL CCHKEQ( THREQ, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN +* +* TZ: Trapezoidal matrix +* + NTYPES = 3 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), S( 1 ), + $ B( 1, 1 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN +* +* QP: QR factorization with pivoting +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ THRESH, A( 1, 1 ), A( 1, 2 ), S( 1 ), + $ B( 1, 1 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN +* +* LS: Least squares drivers +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTDRV ) THEN + CALL CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, + $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ S( 1 ), S( NMAX+1 ), NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN +* +* QT: QRT routines for general matrices +* + IF( TSTCHK ) THEN + CALL CCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QX' 
) ) THEN +* +* QX: QRT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL CCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN +* +* TQ: LQT routines for general matrices +* + IF( TSTCHK ) THEN + CALL CCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN +* +* XQ: LQT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL CCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN +* +* TS: QR routines for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL CCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF +* + ELSE +* + WRITE( NOUT, FMT = 9990 )PATH + END IF +* +* Go back to get another input line. +* + GO TO 80 +* +* Branch to this line when the last record is read. 
+* + 140 CONTINUE + CLOSE ( NIN ) + S2 = SECOND( ) + WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) +* + 9999 FORMAT( / ' Execution not attempted due to input errors' ) + 9998 FORMAT( / ' End of tests' ) + 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) + 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', + $ I6 ) + 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', + $ I6 ) + 9994 FORMAT( ' Tests of the COMPLEX LAPACK routines ', + $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, + $ / / ' The following parameter values will be used:' ) + 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) + 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', + $ 'less than', F8.2, / ) + 9991 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) + 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) + 9989 FORMAT( / 1X, A3, ' routines were not tested' ) + 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) +* +* End of CCHKAA +* + END diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F new file mode 100644 index 000000000..ef9d7808c --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -0,0 +1,1080 @@ +*> \brief \b DCHKAA +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DCHKAA +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DCHKAA is the main test program for the DOUBLE PRECISION LAPACK +*> linear equation routines +*> +*> The program must be driven by a short data file. The first 15 records +*> (not including the first comment line) specify problem dimensions +*> and program options using list-directed input. 
The remaining lines +*> specify the LAPACK test paths and the number of matrix types to use +*> in testing. An annotated example of a data file can be obtained by +*> deleting the first 3 characters from the following 47 lines: +*> Data file for testing DOUBLE PRECISION LAPACK linear eqn. routines +*> 7 Number of values of M +*> 0 1 2 3 5 10 16 Values of M (row dimension) +*> 7 Number of values of N +*> 0 1 2 3 5 10 16 Values of N (column dimension) +*> 1 Number of values of NRHS +*> 2 Values of NRHS (number of right hand sides) +*> 5 Number of values of NB +*> 1 3 3 3 20 Values of NB (the blocksize) +*> 1 0 5 9 1 Values of NX (crossover point) +*> 3 Number of values of RANK +*> 30 50 90 Values of rank (as a % of N) +*> 20.0 Threshold value of test ratio +*> T Put T to test the LAPACK routines +*> T Put T to test the driver routines +*> T Put T to test the error exits +*> DGE 11 List types on next line if 0 < NTYPES < 11 +*> DGB 8 List types on next line if 0 < NTYPES < 8 +*> DGT 12 List types on next line if 0 < NTYPES < 12 +*> DPO 9 List types on next line if 0 < NTYPES < 9 +*> DPS 9 List types on next line if 0 < NTYPES < 9 +*> DPP 9 List types on next line if 0 < NTYPES < 9 +*> DPB 8 List types on next line if 0 < NTYPES < 8 +*> DPT 12 List types on next line if 0 < NTYPES < 12 +*> DSY 10 List types on next line if 0 < NTYPES < 10 +*> DSR 10 List types on next line if 0 < NTYPES < 10 +*> DSK 10 List types on next line if 0 < NTYPES < 10 +*> DSA 10 List types on next line if 0 < NTYPES < 10 +*> DS2 10 List types on next line if 0 < NTYPES < 10 +*> DSP 10 List types on next line if 0 < NTYPES < 10 +*> DTR 18 List types on next line if 0 < NTYPES < 18 +*> DTP 18 List types on next line if 0 < NTYPES < 18 +*> DTB 17 List types on next line if 0 < NTYPES < 17 +*> DQR 8 List types on next line if 0 < NTYPES < 8 +*> DRQ 8 List types on next line if 0 < NTYPES < 8 +*> DLQ 8 List types on next line if 0 < NTYPES < 8 +*> DQL 8 List types on next line if 0 < NTYPES < 8
+*> DQP 6 List types on next line if 0 < NTYPES < 6 +*> DTZ 3 List types on next line if 0 < NTYPES < 3 +*> DLS 6 List types on next line if 0 < NTYPES < 6 +*> DEQ +*> DQT +*> DQX +*> DTQ +*> DXQ +*> DTS +*> DHH +*> \endverbatim +* +* Parameters: +* ========== +* +*> \verbatim +*> NMAX INTEGER +*> The maximum allowable value for M and N. +*> +*> MAXIN INTEGER +*> The number of different values that can be used for each of +*> M, N, NRHS, NB, NX and RANK +*> +*> MAXRHS INTEGER +*> The maximum number of right hand sides +*> +*> MATMAX INTEGER +*> The maximum number of matrix types to use for testing +*> +*> NIN INTEGER +*> The unit number for input +*> +*> NOUT INTEGER +*> The unit number for output +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + PROGRAM DCHKAA +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 132 ) + INTEGER MAXIN + PARAMETER ( MAXIN = 12 ) + INTEGER MAXRHS + PARAMETER ( MAXRHS = 16 ) + INTEGER MATMAX + PARAMETER ( MATMAX = 30 ) + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER KDMAX + PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) +* .. +* .. Local Scalars .. + LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*2 C2 + CHARACTER*3 PATH + CHARACTER*10 INTSTR + CHARACTER*72 ALINE + INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, + $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + DOUBLE PRECISION EPS, S1, S2, THREQ, THRESH +* .. +* .. Local Arrays .. 
+ LOGICAL DOTYPE( MATMAX ) + INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), + $ NBVAL( MAXIN ), NBVAL2( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ RANKVAL( MAXIN ), PIV( NMAX ) + DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK +* .. +* .. External Functions .. + LOGICAL LSAME, LSAMEN + DOUBLE PRECISION DLAMCH, DSECND + EXTERNAL LSAME, LSAMEN, DLAMCH, DSECND +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, + $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, + $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, + $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, + $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, + $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, + $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, + $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, + $ DCHKLQT,DCHKTSQR +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Arrays in Common .. + INTEGER IPARMS( 100 ) +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT + COMMON / CLAENV / IPARMS +* .. +* .. Data statements .. + DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / +* .. +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* +* .. Executable Statements .. +* + S1 = DSECND( ) + LDA = NMAX + FATAL = .FALSE. 
+* +* Read a dummy line. +* + READ( NIN, FMT = * ) +* +* Report values of parameters. +* + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH +* +* Read the values of M +* + READ( NIN, FMT = * )NM + IF( NM.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 + NM = 0 + FATAL = .TRUE. + ELSE IF( NM.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN + NM = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) + DO 10 I = 1, NM + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 10 CONTINUE + IF( NM.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) +* +* Read the values of N +* + READ( NIN, FMT = * )NN + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 20 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + IF( NN.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) +* +* Read the values of NRHS +* + READ( NIN, FMT = * )NNS + IF( NNS.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 + NNS = 0 + FATAL = .TRUE. + ELSE IF( NNS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN + NNS = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) + DO 30 I = 1, NNS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN + WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS + FATAL = .TRUE. 
+ END IF + 30 CONTINUE + IF( NNS.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) +* +* Read the values of NB +* + READ( NIN, FMT = * )NNB + IF( NNB.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 + NNB = 0 + FATAL = .TRUE. + ELSE IF( NNB.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN + NNB = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) + DO 40 I = 1, NNB + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + END IF + 40 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) +* +* Set NBVAL2 to be the set of unique values of NB +* + NNB2 = 0 + DO 60 I = 1, NNB + NB = NBVAL( I ) + DO 50 J = 1, NNB2 + IF( NB.EQ.NBVAL2( J ) ) + $ GO TO 60 + 50 CONTINUE + NNB2 = NNB2 + 1 + NBVAL2( NNB2 ) = NB + 60 CONTINUE +* +* Read the values of NX +* + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) + DO 70 I = 1, NNB + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + END IF + 70 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) +* +* Read the values of RANKVAL +* + READ( NIN, FMT = * )NRANK + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 + NRANK = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN + NRANK = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) + DO I = 1, NRANK + IF( RANKVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( RANKVAL( I ).GT.100 ) THEN + WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 + FATAL = .TRUE. + END IF + END DO + IF( NRANK.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', + $ ( RANKVAL( I ), I = 1, NRANK ) +* +* Read the threshold value for the test ratios. 
+* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9992 )THRESH +* +* Read the flag that indicates whether to test the LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test the driver routines. +* + READ( NIN, FMT = * )TSTDRV +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Calculate and print the machine dependent constants. +* + EPS = DLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9991 )'underflow', EPS + EPS = DLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9991 )'overflow ', EPS + EPS = DLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9991 )'precision', EPS + WRITE( NOUT, FMT = * ) +* + 80 CONTINUE +* +* Read a test path and the number of matrix types to use. +* + READ( NIN, FMT = '(A72)', END = 140 )ALINE + PATH = ALINE( 1: 3 ) + NMATS = MATMAX + I = 3 + 90 CONTINUE + I = I + 1 + IF( I.GT.72 ) THEN + NMATS = MATMAX + GO TO 130 + END IF + IF( ALINE( I: I ).EQ.' ' ) + $ GO TO 90 + NMATS = 0 + 100 CONTINUE + C1 = ALINE( I: I ) + DO 110 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 120 + END IF + 110 CONTINUE + GO TO 130 + 120 CONTINUE + NMATS = NMATS*10 + IC + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + GO TO 100 + 130 CONTINUE + C1 = PATH( 1: 1 ) + C2 = PATH( 2: 3 ) + NRHS = NSVAL( 1 ) +* +* Check first character for correct precision. +* + IF( .NOT.LSAME( C1, 'Double precision' ) ) THEN + WRITE( NOUT, FMT = 9990 )PATH +* + ELSE IF( NMATS.LE.0 ) THEN +* +* Check for a positive number of tests requested. 
+* + WRITE( NOUT, FMT = 9989 )PATH +* + ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN +* +* GE: general matrices +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN +* +* GB: general banded matrices +* + LA = ( 2*KDMAX+1 )*NMAX + LAFAC = ( 3*KDMAX+1 )*NMAX + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, + $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN +* +* GT: general tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH 
+ END IF +* + ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN +* +* PO: positive definite matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN +* +* PS: positive semi-definite matrices +* + NTYPES = 9 +* + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, + $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN +* +* PP: positive definite packed matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN +* +* PB: positive definite banded matrices +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 
1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN +* +* PT: positive definite tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN +* +* SY: symmetric indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN +* +* SR: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, 
LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN +* +* SK: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than SR path version. +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* SA: symmetric indefinite matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* +* + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* SA: symmetric indefinite matrices, +* with partial (Aasen's) pivoting 
algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, + $ NNS, NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* +* + ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN +* +* SP: symmetric indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN +* +* TR: triangular matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN +* +* TP: triangular packed matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), 
WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN +* +* TB: triangular banded matrices +* + NTYPES = 17 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN +* +* QR: QR factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN +* +* LQ: LQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN +* +* QL: QL factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN +* +* RQ: RQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, 
NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN +* +* QP: QR factorization with pivoting +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 3 ), WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN +* +* TZ: Trapezoidal matrix +* + NTYPES = 3 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 3 ), WORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN +* +* LS: Least squares drivers +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTDRV ) THEN + CALL DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, + $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ RWORK, RWORK( NMAX+1 ), NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN +* +* EQ: Equilibration routines for general and positive definite +* matrices (THREQ should be between 2 and 10) +* + IF( TSTCHK ) THEN + CALL DCHKEQ( THREQ, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN +* +* QT: QRT routines for general matrices +* + IF( TSTCHK ) THEN + CALL DCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN +* +* QX: QRT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL DCHKQRTP( THRESH, TSTERR, NM, 
MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN +* +* TQ: LQT routines for general matrices +* + IF( TSTCHK ) THEN + CALL DCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN +* +* XQ: LQT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL DCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN +* +* TS: QR routines for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL DCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF +* + ELSE + +* + WRITE( NOUT, FMT = 9990 )PATH + END IF +* +* Go back to get another input line. +* + GO TO 80 +* +* Branch to this line when the last record is read. 
+* + 140 CONTINUE + CLOSE ( NIN ) + S2 = DSECND( ) + WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) +* + 9999 FORMAT( / ' Execution not attempted due to input errors' ) + 9998 FORMAT( / ' End of tests' ) + 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) + 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', + $ I6 ) + 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', + $ I6 ) + 9994 FORMAT( ' Tests of the DOUBLE PRECISION LAPACK routines ', + $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, + $ / / ' The following parameter values will be used:' ) + 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) + 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', + $ 'less than', F8.2, / ) + 9991 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) + 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) + 9989 FORMAT( / 1X, A3, ' routines were not tested' ) + 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) +* +* End of DCHKAA +* + END diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F new file mode 100644 index 000000000..a5b826d06 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/schkaa.F @@ -0,0 +1,1074 @@ +*> \brief \b SCHKAA +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SCHKAA +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SCHKAA is the main test program for the REAL LAPACK +*> linear equation routines +*> +*> The program must be driven by a short data file. The first 15 records +*> (not including the first comment line) specify problem dimensions +*> and program options using list-directed input. 
The remaining lines +*> specify the LAPACK test paths and the number of matrix types to use +*> in testing. An annotated example of a data file can be obtained by +*> deleting the first 3 characters from the following 40 lines: +*> Data file for testing REAL LAPACK linear eqn. routines +*> 7 Number of values of M +*> 0 1 2 3 5 10 16 Values of M (row dimension) +*> 7 Number of values of N +*> 0 1 2 3 5 10 16 Values of N (column dimension) +*> 1 Number of values of NRHS +*> 2 Values of NRHS (number of right hand sides) +*> 5 Number of values of NB +*> 1 3 3 3 20 Values of NB (the blocksize) +*> 1 0 5 9 1 Values of NX (crossover point) +*> 3 Number of values of RANK +*> 30 50 90 Values of rank (as a % of N) +*> 20.0 Threshold value of test ratio +*> T Put T to test the LAPACK routines +*> T Put T to test the driver routines +*> T Put T to test the error exits +*> SGE 11 List types on next line if 0 < NTYPES < 11 +*> SGB 8 List types on next line if 0 < NTYPES < 8 +*> SGT 12 List types on next line if 0 < NTYPES < 12 +*> SPO 9 List types on next line if 0 < NTYPES < 9 +*> SPS 9 List types on next line if 0 < NTYPES < 9 +*> SPP 9 List types on next line if 0 < NTYPES < 9 +*> SPB 8 List types on next line if 0 < NTYPES < 8 +*> SPT 12 List types on next line if 0 < NTYPES < 12 +*> SSY 10 List types on next line if 0 < NTYPES < 10 +*> SSR 10 List types on next line if 0 < NTYPES < 10 +*> SSK 10 List types on next line if 0 < NTYPES < 10 +*> SSA 10 List types on next line if 0 < NTYPES < 10 +*> SS2 10 List types on next line if 0 < NTYPES < 10 +*> SSP 10 List types on next line if 0 < NTYPES < 10 +*> STR 18 List types on next line if 0 < NTYPES < 18 +*> STP 18 List types on next line if 0 < NTYPES < 18 +*> STB 17 List types on next line if 0 < NTYPES < 17 +*> SQR 8 List types on next line if 0 < NTYPES < 8 +*> SRQ 8 List types on next line if 0 < NTYPES < 8 +*> SLQ 8 List types on next line if 0 < NTYPES < 8 +*> SQL 8 List types on next line if 0 < NTYPES < 8 +*> SQP 6 List 
types on next line if 0 < NTYPES < 6 +*> STZ 3 List types on next line if 0 < NTYPES < 3 +*> SLS 6 List types on next line if 0 < NTYPES < 6 +*> SEQ +*> SQT +*> SQX +*> STS +*> SHH +*> \endverbatim +* +* Parameters: +* ========== +* +*> \verbatim +*> NMAX INTEGER +*> The maximum allowable value for M and N. +*> +*> MAXIN INTEGER +*> The number of different values that can be used for each of +*> M, N, NRHS, NB, NX and RANK +*> +*> MAXRHS INTEGER +*> The maximum number of right hand sides +*> +*> MATMAX INTEGER +*> The maximum number of matrix types to use for testing +*> +*> NIN INTEGER +*> The unit number for input +*> +*> NOUT INTEGER +*> The unit number for output +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + PROGRAM SCHKAA +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 132 ) + INTEGER MAXIN + PARAMETER ( MAXIN = 12 ) + INTEGER MAXRHS + PARAMETER ( MAXRHS = 16 ) + INTEGER MATMAX + PARAMETER ( MATMAX = 30 ) + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER KDMAX + PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) +* .. +* .. Local Scalars .. + LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*2 C2 + CHARACTER*3 PATH + CHARACTER*10 INTSTR + CHARACTER*72 ALINE + INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, + $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + REAL EPS, S1, S2, THREQ, THRESH +* .. +* .. Local Arrays .. 
+ LOGICAL DOTYPE( MATMAX ) + INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), + $ NBVAL( MAXIN ), NBVAL2( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ RANKVAL( MAXIN ), PIV( NMAX ) + REAL E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK +* .. +* .. External Functions .. + LOGICAL LSAME, LSAMEN + REAL SECOND, SLAMCH + EXTERNAL LSAME, LSAMEN, SECOND, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ, + $ SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP, + $ SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP, + $ SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, + $ SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, + $ SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, + $ SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK, + $ SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP, + $ SCHKLQT, SCHKTSQR +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Arrays in Common .. + INTEGER IPARMS( 100 ) +* .. +* .. Common blocks .. + COMMON / CLAENV / IPARMS + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA THREQ / 2.0E0 / , INTSTR / '0123456789' / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + S1 = SECOND( ) + LDA = NMAX + FATAL = .FALSE. +* +* Read a dummy line. 
+*
+      READ( NIN, FMT = * )
+*
+*     Report the LAPACK version returned by ILAVER (format 9994).
+*
+      CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH )
+      WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH
+*     Read the values of M: first the count NM, which must lie in
+*     1..MAXIN, then NM row dimensions, each in 0..NMAX.  A bad
+*     count is reset to 0 so the list read and loop below are empty.
+      READ( NIN, FMT = * )NM
+      IF( NM.LT.1 ) THEN
+         WRITE( NOUT, FMT = 9996 )' NM ', NM, 1
+         NM = 0
+         FATAL = .TRUE.
+      ELSE IF( NM.GT.MAXIN ) THEN
+         WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN
+         NM = 0
+         FATAL = .TRUE.
+      END IF
+      READ( NIN, FMT = * )( MVAL( I ), I = 1, NM )
+      DO 10 I = 1, NM
+         IF( MVAL( I ).LT.0 ) THEN
+            WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0
+            FATAL = .TRUE.
+         ELSE IF( MVAL( I ).GT.NMAX ) THEN
+            WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX
+            FATAL = .TRUE.
+         END IF
+   10 CONTINUE
+      IF( NM.GT.0 )
+     $   WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM )
+*     Read the values of N: first the count NN, which must lie in
+*     1..MAXIN, then NN column dimensions, each in 0..NMAX.
+*
+      READ( NIN, FMT = * )NN
+      IF( NN.LT.1 ) THEN
+         WRITE( NOUT, FMT = 9996 )' NN ', NN, 1
+         NN = 0
+         FATAL = .TRUE.
+      ELSE IF( NN.GT.MAXIN ) THEN
+         WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN
+         NN = 0
+         FATAL = .TRUE.
+      END IF
+      READ( NIN, FMT = * )( NVAL( I ), I = 1, NN )
+      DO 20 I = 1, NN
+         IF( NVAL( I ).LT.0 ) THEN
+            WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0
+            FATAL = .TRUE.
+         ELSE IF( NVAL( I ).GT.NMAX ) THEN
+            WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX
+            FATAL = .TRUE.
+         END IF
+   20 CONTINUE
+      IF( NN.GT.0 )
+     $   WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN )
+*     Read the values of NRHS: first the count NNS, which must lie
+*     in 1..MAXIN, then NNS right-hand-side counts, each in
+*     0..MAXRHS.
+      READ( NIN, FMT = * )NNS
+      IF( NNS.LT.1 ) THEN
+         WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1
+         NNS = 0
+         FATAL = .TRUE.
+      ELSE IF( NNS.GT.MAXIN ) THEN
+         WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN
+         NNS = 0
+         FATAL = .TRUE.
+      END IF
+      READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS )
+      DO 30 I = 1, NNS
+         IF( NSVAL( I ).LT.0 ) THEN
+            WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0
+            FATAL = .TRUE.
+         ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN
+            WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS
+            FATAL = .TRUE.
+         END IF
+   30 CONTINUE
+      IF( NNS.GT.0 )
+     $   WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS )
+*     Read the values of NB: first the count NNB, which must lie in
+*     1..MAXIN, then NNB block sizes, each required to be >= 0.
+*
+      READ( NIN, FMT = * )NNB
+      IF( NNB.LT.1 ) THEN
+         WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1
+         NNB = 0
+         FATAL = .TRUE.
+      ELSE IF( NNB.GT.MAXIN ) THEN
+         WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN
+         NNB = 0
+         FATAL = .TRUE.
+      END IF
+      READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB )
+      DO 40 I = 1, NNB
+         IF( NBVAL( I ).LT.0 ) THEN
+            WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0
+            FATAL = .TRUE.
+         END IF
+   40 CONTINUE
+      IF( NNB.GT.0 )
+     $   WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB )
+*
+*     Set NBVAL2 to be the set of unique values of NB (NNB2 of
+*     them, kept in order of first occurrence in NBVAL).
+      NNB2 = 0
+      DO 60 I = 1, NNB
+         NB = NBVAL( I )
+         DO 50 J = 1, NNB2
+            IF( NB.EQ.NBVAL2( J ) )
+     $         GO TO 60
+   50    CONTINUE
+         NNB2 = NNB2 + 1
+         NBVAL2( NNB2 ) = NB
+   60 CONTINUE
+*     Read the values of NX: NNB crossover points, each required
+*     to be >= 0.  There is no separate count record; the list
+*     length is the NNB read above.
+      READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB )
+      DO 70 I = 1, NNB
+         IF( NXVAL( I ).LT.0 ) THEN
+            WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0
+            FATAL = .TRUE.
+         END IF
+   70 CONTINUE
+      IF( NNB.GT.0 )
+     $   WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB )
+*     Read the values of RANKVAL: first the count NRANK, then NRANK
+*     ranks, each a percentage of N in 0..100.  NRANK itself must
+*     lie in 1..MAXIN because it bounds the read into RANKVAL.
+      READ( NIN, FMT = * )NRANK
+      IF( NRANK.LT.1 ) THEN
+         WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1
+         NRANK = 0
+         FATAL = .TRUE.
+      ELSE IF( NRANK.GT.MAXIN ) THEN
+         WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN
+         NRANK = 0
+         FATAL = .TRUE.
+      END IF
+      READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK )
+      DO I = 1, NRANK
+         IF( RANKVAL( I ).LT.0 ) THEN
+            WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0
+            FATAL = .TRUE.
+         ELSE IF( RANKVAL( I ).GT.100 ) THEN
+            WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100
+            FATAL = .TRUE.
+         END IF
+      END DO
+      IF( NRANK.GT.0 )
+     $   WRITE( NOUT, FMT = 9993 )'RANK % OF N',
+     $   ( RANKVAL( I ), I = 1, NRANK )
+*
+*     Read the threshold value for the test ratios.
+* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9992 )THRESH +* +* Read the flag that indicates whether to test the LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test the driver routines. +* + READ( NIN, FMT = * )TSTDRV +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Calculate and print the machine dependent constants. +* + EPS = SLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9991 )'underflow', EPS + EPS = SLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9991 )'overflow ', EPS + EPS = SLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9991 )'precision', EPS + WRITE( NOUT, FMT = * ) +* + 80 CONTINUE +* +* Read a test path and the number of matrix types to use. +* + READ( NIN, FMT = '(A72)', END = 140 )ALINE + PATH = ALINE( 1: 3 ) + NMATS = MATMAX + I = 3 + 90 CONTINUE + I = I + 1 + IF( I.GT.72 ) THEN + NMATS = MATMAX + GO TO 130 + END IF + IF( ALINE( I: I ).EQ.' ' ) + $ GO TO 90 + NMATS = 0 + 100 CONTINUE + C1 = ALINE( I: I ) + DO 110 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 120 + END IF + 110 CONTINUE + GO TO 130 + 120 CONTINUE + NMATS = NMATS*10 + IC + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + GO TO 100 + 130 CONTINUE + C1 = PATH( 1: 1 ) + C2 = PATH( 2: 3 ) + NRHS = NSVAL( 1 ) +* +* Check first character for correct precision. +* + IF( .NOT.LSAME( C1, 'Single precision' ) ) THEN + WRITE( NOUT, FMT = 9990 )PATH +* + ELSE IF( NMATS.LE.0 ) THEN +* +* Check for a positive number of tests requested. 
+* + WRITE( NOUT, FMT = 9989 )PATH +* + ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN +* +* GE: general matrices +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN +* +* GB: general banded matrices +* + LA = ( 2*KDMAX+1 )*NMAX + LAFAC = ( 3*KDMAX+1 )*NMAX + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, + $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN +* +* GT: general tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH 
+ END IF +* + ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN +* +* PO: positive definite matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN +* +* PS: positive semi-definite matrices +* + NTYPES = 9 +* + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, + $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN +* +* PP: positive definite packed matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN +* +* PB: positive definite banded matrices +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 
1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN +* +* PT: positive definite tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN +* +* SY: symmetric indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN +* +* SR: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, 
LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN +* +* SK: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than SR path version. +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* SA: symmetric indefinite matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* SA: symmetric indefinite matrices, +* with partial (Aasen's) pivoting 
algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, + $ NNS, NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN +* +* SP: symmetric indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN +* +* TR: triangular matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN +* +* TP: triangular packed matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, 
RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN +* +* TB: triangular banded matrices +* + NTYPES = 17 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN +* +* QR: QR factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN +* +* LQ: LQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN +* +* QL: QL factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN +* +* RQ: RQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, 
NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN +* +* QP: QR factorization with pivoting +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 3 ), WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN +* +* TZ: Trapezoidal matrix +* + NTYPES = 3 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 3 ), WORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN +* +* LS: Least squares drivers +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTDRV ) THEN + CALL SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, + $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ RWORK, RWORK( NMAX+1 ), NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN +* +* EQ: Equilibration routines for general and positive definite +* matrices (THREQ should be between 2 and 10) +* + IF( TSTCHK ) THEN + CALL SCHKEQ( THREQ, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN +* +* QT: QRT routines for general matrices +* + IF( TSTCHK ) THEN + CALL SCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN +* +* QX: QRT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL SCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, 
NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN +* +* TQ: LQT routines for general matrices +* + IF( TSTCHK ) THEN + CALL SCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN +* +* XQ: LQT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL SCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN +* +* TS: QR routines for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL SCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF +* + ELSE +* + WRITE( NOUT, FMT = 9990 )PATH + END IF +* +* Go back to get another input line. +* + GO TO 80 +* +* Branch to this line when the last record is read. 
+* + 140 CONTINUE + CLOSE ( NIN ) + S2 = SECOND( ) + WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) +* + 9999 FORMAT( / ' Execution not attempted due to input errors' ) + 9998 FORMAT( / ' End of tests' ) + 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) + 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', + $ I6 ) + 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', + $ I6 ) + 9994 FORMAT( ' Tests of the REAL LAPACK routines ', + $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, + $ / / ' The following parameter values will be used:' ) + 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) + 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', + $ 'less than', F8.2, / ) + 9991 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) + 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) + 9989 FORMAT( / 1X, A3, ' routines were not tested' ) + 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) +* +* End of SCHKAA +* + END diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F new file mode 100644 index 000000000..a118515a5 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zchkaa.F @@ -0,0 +1,1271 @@ +*> \brief \b ZCHKAA +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZCHKAA +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZCHKAA is the main test program for the COMPLEX*16 linear equation +*> routines. +*> +*> The program must be driven by a short data file. The first 15 records +*> (not including the first comment line) specify problem dimensions +*> and program options using list-directed input. 
The remaining lines +*> specify the LAPACK test paths and the number of matrix types to use +*> in testing. An annotated example of a data file can be obtained by +*> deleting the first 3 characters from the following 51 lines: +*> Data file for testing COMPLEX*16 LAPACK linear equation routines +*> 7 Number of values of M +*> 0 1 2 3 5 10 16 Values of M (row dimension) +*> 7 Number of values of N +*> 0 1 2 3 5 10 16 Values of N (column dimension) +*> 1 Number of values of NRHS +*> 2 Values of NRHS (number of right hand sides) +*> 5 Number of values of NB +*> 1 3 3 3 20 Values of NB (the blocksize) +*> 1 0 5 9 1 Values of NX (crossover point) +*> 3 Number of values of RANK +*> 30 50 90 Values of rank (as a % of N) +*> 30.0 Threshold value of test ratio +*> T Put T to test the LAPACK routines +*> T Put T to test the driver routines +*> T Put T to test the error exits +*> ZGE 11 List types on next line if 0 < NTYPES < 11 +*> ZGB 8 List types on next line if 0 < NTYPES < 8 +*> ZGT 12 List types on next line if 0 < NTYPES < 12 +*> ZPO 9 List types on next line if 0 < NTYPES < 9 +*> ZPS 9 List types on next line if 0 < NTYPES < 9 +*> ZPP 9 List types on next line if 0 < NTYPES < 9 +*> ZPB 8 List types on next line if 0 < NTYPES < 8 +*> ZPT 12 List types on next line if 0 < NTYPES < 12 +*> ZHE 10 List types on next line if 0 < NTYPES < 10 +*> ZHR 10 List types on next line if 0 < NTYPES < 10 +*> ZHK 10 List types on next line if 0 < NTYPES < 10 +*> ZHA 10 List types on next line if 0 < NTYPES < 10 +*> ZH2 10 List types on next line if 0 < NTYPES < 10 +*> ZSA 11 List types on next line if 0 < NTYPES < 11 +*> ZS2 11 List types on next line if 0 < NTYPES < 11 +*> ZHP 10 List types on next line if 0 < NTYPES < 10 +*> ZSY 11 List types on next line if 0 < NTYPES < 11 +*> ZSR 11 List types on next line if 0 < NTYPES < 11 +*> ZSK 11 List types on next line if 0 < NTYPES < 11 +*> ZSP 11 List types on next line if 0 < NTYPES < 11 +*> ZTR 18 List types on next line if 0 < NTYPES <
18 +*> ZTP 18 List types on next line if 0 < NTYPES < 18 +*> ZTB 17 List types on next line if 0 < NTYPES < 17 +*> ZQR 8 List types on next line if 0 < NTYPES < 8 +*> ZRQ 8 List types on next line if 0 < NTYPES < 8 +*> ZLQ 8 List types on next line if 0 < NTYPES < 8 +*> ZQL 8 List types on next line if 0 < NTYPES < 8 +*> ZQP 6 List types on next line if 0 < NTYPES < 6 +*> ZTZ 3 List types on next line if 0 < NTYPES < 3 +*> ZLS 6 List types on next line if 0 < NTYPES < 6 +*> ZEQ +*> ZQT +*> ZQX +*> ZTS +*> ZHH +*> \endverbatim +* +* Parameters: +* ========== +* +*> \verbatim +*> NMAX INTEGER +*> The maximum allowable value for M and N. +*> +*> MAXIN INTEGER +*> The number of different values that can be used for each of +*> M, N, NRHS, NB, NX and RANK +*> +*> MAXRHS INTEGER +*> The maximum number of right hand sides +*> +*> MATMAX INTEGER +*> The maximum number of matrix types to use for testing +*> +*> NIN INTEGER +*> The unit number for input +*> +*> NOUT INTEGER +*> The unit number for output +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + PROGRAM ZCHKAA +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 132 ) + INTEGER MAXIN + PARAMETER ( MAXIN = 12 ) + INTEGER MAXRHS + PARAMETER ( MAXRHS = 16 ) + INTEGER MATMAX + PARAMETER ( MATMAX = 30 ) + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER KDMAX + PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) +* .. +* .. Local Scalars .. 
+ LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*2 C2 + CHARACTER*3 PATH + CHARACTER*10 INTSTR + CHARACTER*72 ALINE + INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, + $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + DOUBLE PRECISION EPS, S1, S2, THREQ, THRESH +* .. +* .. Local Arrays .. + LOGICAL DOTYPE( MATMAX ) + INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), + $ NBVAL( MAXIN ), NBVAL2( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ RANKVAL( MAXIN ), PIV( NMAX ) + DOUBLE PRECISION S( 2*NMAX ) + COMPLEX*16 E( NMAX ) +* +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK +* .. +* .. External Functions .. + LOGICAL LSAME, LSAMEN + DOUBLE PRECISION DLAMCH, DSECND + EXTERNAL LSAME, LSAMEN, DLAMCH, DSECND +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, + $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, + $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, + $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, + $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, + $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, + $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, + $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, + $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, + $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, + $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, + $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Arrays in Common .. + INTEGER IPARMS( 100 ) +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT + COMMON / CLAENV / IPARMS +* .. +* .. Data statements .. + DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / +* +* .. Allocate memory dynamically .. 
+ ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + S1 = DSECND( ) + LDA = NMAX + FATAL = .FALSE. +* +* Read a dummy line. +* + READ( NIN, FMT = * ) +* +* Report values of parameters. +* + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH +* +* Read the values of M +* + READ( NIN, FMT = * )NM + IF( NM.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 + NM = 0 + FATAL = .TRUE. + ELSE IF( NM.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN + NM = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) + DO 10 I = 1, NM + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 10 CONTINUE + IF( NM.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) +* +* Read the values of N +* + READ( NIN, FMT = * )NN + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 20 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. 
+ END IF + 20 CONTINUE + IF( NN.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) +* +* Read the values of NRHS +* + READ( NIN, FMT = * )NNS + IF( NNS.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 + NNS = 0 + FATAL = .TRUE. + ELSE IF( NNS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN + NNS = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) + DO 30 I = 1, NNS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN + WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS + FATAL = .TRUE. + END IF + 30 CONTINUE + IF( NNS.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) +* +* Read the values of NB +* + READ( NIN, FMT = * )NNB + IF( NNB.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 + NNB = 0 + FATAL = .TRUE. + ELSE IF( NNB.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN + NNB = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) + DO 40 I = 1, NNB + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + END IF + 40 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) +* +* Set NBVAL2 to be the set of unique values of NB +* + NNB2 = 0 + DO 60 I = 1, NNB + NB = NBVAL( I ) + DO 50 J = 1, NNB2 + IF( NB.EQ.NBVAL2( J ) ) + $ GO TO 60 + 50 CONTINUE + NNB2 = NNB2 + 1 + NBVAL2( NNB2 ) = NB + 60 CONTINUE +* +* Read the values of NX +* + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) + DO 70 I = 1, NNB + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + END IF + 70 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) +* +* Read NRANK (must be in 1..MAXIN) and the values of RANKVAL +* + READ( NIN, FMT = * )NRANK + IF( NRANK.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 + NRANK = 0 + FATAL = .TRUE.
+ ELSE IF( NRANK.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN + NRANK = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) + DO I = 1, NRANK + IF( RANKVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( RANKVAL( I ).GT.100 ) THEN + WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 + FATAL = .TRUE. + END IF + END DO + IF( NRANK.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', + $ ( RANKVAL( I ), I = 1, NRANK ) +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9992 )THRESH +* +* Read the flag that indicates whether to test the LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test the driver routines. +* + READ( NIN, FMT = * )TSTDRV +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Calculate and print the machine dependent constants. +* + EPS = DLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9991 )'underflow', EPS + EPS = DLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9991 )'overflow ', EPS + EPS = DLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9991 )'precision', EPS + WRITE( NOUT, FMT = * ) + NRHS = NSVAL( 1 ) +* + 80 CONTINUE +* +* Read a test path and the number of matrix types to use. +* + READ( NIN, FMT = '(A72)', END = 140 )ALINE + PATH = ALINE( 1: 3 ) + NMATS = MATMAX + I = 3 + 90 CONTINUE + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + IF( ALINE( I: I ).EQ.' ' ) + $ GO TO 90 + NMATS = 0 + 100 CONTINUE + C1 = ALINE( I: I ) + DO 110 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 120 + END IF + 110 CONTINUE + GO TO 130 + 120 CONTINUE + NMATS = NMATS*10 + IC + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + GO TO 100 + 130 CONTINUE + C1 = PATH( 1: 1 ) + C2 = PATH( 2: 3 ) +* +* Check first character for correct precision.
+* + IF( .NOT.LSAME( C1, 'Zomplex precision' ) ) THEN + WRITE( NOUT, FMT = 9990 )PATH +* + ELSE IF( NMATS.LE.0 ) THEN +* +* Check for a positive number of tests requested. +* + WRITE( NOUT, FMT = 9989 )PATH +* + ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN +* +* GE: general matrices +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN +* +* GB: general banded matrices +* + LA = ( 2*KDMAX+1 )*NMAX + LAFAC = ( 3*KDMAX+1 )*NMAX + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, + $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN +* +* GT: general tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL 
ZDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN +* +* PO: positive definite matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN +* +* PS: positive semi-definite matrices +* + NTYPES = 9 +* + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, + $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN +* +* PP: positive definite packed matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN +* +* PB: positive definite banded matrices +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, 
NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN +* +* PT: positive definite tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HE' ) ) THEN +* +* HE: Hermitian indefinite matrices +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF + + ELSE IF( LSAMEN( 2, C2, 'HR' ) ) THEN +* +* HR: Hermitian indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL 
ZCHKHE_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HK' ) ) THEN +* +* HK: Hermitian indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than HR path version. +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE_RK ( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN +* +* HA: Hermitian matrices, +* Aasen Algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN +* +* H2: Hermitian matrices, +* with 
partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, + $ NNS, NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* +* + ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN +* +* HP: Hermitian indefinite packed matrices +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN +* +* SY: symmetric indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, 
C2, 'SR' ) ) THEN +* +* SR: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN +* +* SK: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than SR path version. +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* SA: symmetric indefinite matrices with Aasen's algorithm, +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 
1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* S2: symmetric indefinite matrices with Aasen's algorithm +* 2 stage +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN +* +* SP: symmetric indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN +* +* TR: triangular matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN +* +* TP: triangular 
packed matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN +* +* TB: triangular banded matrices +* + NTYPES = 17 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN +* +* QR: QR factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN +* +* LQ: LQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN +* +* QL: QL factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END 
IF +* + ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN +* +* RQ: RQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN +* +* EQ: Equilibration routines for general and positive definite +* matrices (THREQ should be between 2 and 10) +* + IF( TSTCHK ) THEN + CALL ZCHKEQ( THREQ, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN +* +* TZ: Trapezoidal matrix +* + NTYPES = 3 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), S( 1 ), + $ B( 1, 1 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN +* +* QP: QR factorization with pivoting +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ THRESH, A( 1, 1 ), A( 1, 2 ), S( 1 ), + $ B( 1, 1 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN +* +* LS: Least squares drivers (gated by TSTDRV, reported as drivers) +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTDRV ) THEN + CALL ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, + $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ S( 1 ), S( NMAX+1 ), NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* +* + ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN +* +* QT: QRT routines for general matrices +* + IF( TSTCHK ) THEN + CALL ZCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, +
$ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN +* +* QX: QRT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL ZCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN +* +* TQ: LQT routines for general matrices +* + IF( TSTCHK ) THEN + CALL ZCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN +* +* XQ: LQT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL ZCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN +* +* TS: QR routines for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL ZCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN +* +* TQ: LQT routines for general matrices +* + IF( TSTCHK ) THEN + CALL ZCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN +* +* XQ: LQT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL ZCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN +* +* TS: QR routines for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL ZCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + 
END IF +* + ELSE +* + WRITE( NOUT, FMT = 9990 )PATH + END IF +* +* Go back to get another input line. +* + GO TO 80 +* +* Branch to this line when the last record is read. +* + 140 CONTINUE + CLOSE ( NIN ) + S2 = DSECND( ) + WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) +* + 9999 FORMAT( / ' Execution not attempted due to input errors' ) + 9998 FORMAT( / ' End of tests' ) + 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) + 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', + $ I6 ) + 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', + $ I6 ) + 9994 FORMAT( ' Tests of the COMPLEX*16 LAPACK routines ', + $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, + $ / / ' The following parameter values will be used:' ) + 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) + 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', + $ 'less than', F8.2, / ) + 9991 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) + 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) + 9989 FORMAT( / 1X, A3, ' routines were not tested' ) + 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) +* +* End of ZCHKAA +* + END From 2c7d4a77664ca2657d0ff496fa100557a2813b06 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:53:38 +0200 Subject: [PATCH 016/108] Delete cchkaa.f --- lapack-netlib/TESTING/LIN/cchkaa.f | 1220 ---------------------------- 1 file changed, 1220 deletions(-) delete mode 100644 lapack-netlib/TESTING/LIN/cchkaa.f diff --git a/lapack-netlib/TESTING/LIN/cchkaa.f b/lapack-netlib/TESTING/LIN/cchkaa.f deleted file mode 100644 index d36770be7..000000000 --- a/lapack-netlib/TESTING/LIN/cchkaa.f +++ /dev/null @@ -1,1220 +0,0 @@ -*> \brief \b CCHKAA -* -* =========== DOCUMENTATION =========== -* -* Online html 
documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM CCHKAA -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> CCHKAA is the main test program for the COMPLEX linear equation -*> routines. -*> -*> The program must be driven by a short data file. The first 15 records -*> (not including the first comment line) specify problem dimensions -*> and program options using list-directed input. The remaining lines -*> specify the LAPACK test paths and the number of matrix types to use -*> in testing. An annotated example of a data file can be obtained by -*> deleting the first 3 characters from the following 42 lines: -*> Data file for testing COMPLEX LAPACK linear equation routines -*> 7 Number of values of M -*> 0 1 2 3 5 10 16 Values of M (row dimension) -*> 7 Number of values of N -*> 0 1 2 3 5 10 16 Values of N (column dimension) -*> 1 Number of values of NRHS -*> 2 Values of NRHS (number of right hand sides) -*> 5 Number of values of NB -*> 1 3 3 3 20 Values of NB (the blocksize) -*> 1 0 5 9 1 Values of NX (crossover point) -*> 3 Number of values of RANK -*> 30 50 90 Values of rank (as a % of N) -*> 30.0 Threshold value of test ratio -*> T Put T to test the LAPACK routines -*> T Put T to test the driver routines -*> T Put T to test the error exits -*> CGE 11 List types on next line if 0 < NTYPES < 11 -*> CGB 8 List types on next line if 0 < NTYPES < 8 -*> CGT 12 List types on next line if 0 < NTYPES < 12 -*> CPO 9 List types on next line if 0 < NTYPES < 9 -*> CPO 9 List types on next line if 0 < NTYPES < 9 -*> CPP 9 List types on next line if 0 < NTYPES < 9 -*> CPB 8 List types on next line if 0 < NTYPES < 8 -*> CPT 12 List types on next line if 0 < NTYPES < 12 -*> CHE 10 List types on next line if 0 < NTYPES < 10 -*> CHR 10 List types on next line if 0 < NTYPES < 10 -*> CHK 10 List types on next line if 0 < NTYPES < 10 -*> CHA 10 List types on next line if 0 < NTYPES < 10 -*> CH2 10 
List types on next line if 0 < NTYPES < 10 -*> CSA 11 List types on next line if 0 < NTYPES < 10 -*> CS2 11 List types on next line if 0 < NTYPES < 10 -*> CHP 10 List types on next line if 0 < NTYPES < 10 -*> CSY 11 List types on next line if 0 < NTYPES < 11 -*> CSK 11 List types on next line if 0 < NTYPES < 11 -*> CSR 11 List types on next line if 0 < NTYPES < 11 -*> CSP 11 List types on next line if 0 < NTYPES < 11 -*> CTR 18 List types on next line if 0 < NTYPES < 18 -*> CTP 18 List types on next line if 0 < NTYPES < 18 -*> CTB 17 List types on next line if 0 < NTYPES < 17 -*> CQR 8 List types on next line if 0 < NTYPES < 8 -*> CRQ 8 List types on next line if 0 < NTYPES < 8 -*> CLQ 8 List types on next line if 0 < NTYPES < 8 -*> CQL 8 List types on next line if 0 < NTYPES < 8 -*> CQP 6 List types on next line if 0 < NTYPES < 6 -*> CTZ 3 List types on next line if 0 < NTYPES < 3 -*> CLS 6 List types on next line if 0 < NTYPES < 6 -*> CEQ -*> CQT -*> CQX -*> CTS -*> CHH -*> \endverbatim -* -* Parameters: -* ========== -* -*> \verbatim -*> NMAX INTEGER -*> The maximum allowable value for M and N. -*> -*> MAXIN INTEGER -*> The number of different values that can be used for each of -*> M, N, NRHS, NB, NX and RANK -*> -*> MAXRHS INTEGER -*> The maximum number of right hand sides -*> -*> MATMAX INTEGER -*> The maximum number of matrix types to use for testing -*> -*> NIN INTEGER -*> The unit number for input -*> -*> NOUT INTEGER -*> The unit number for output -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date November 2019 -* -*> \ingroup complex_lin -* -* ===================================================================== - PROGRAM CCHKAA -* -* -- LAPACK test routine (version 3.9.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- -* November 2017 -* -* ===================================================================== -* -* .. Parameters .. - INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER MAXIN - PARAMETER ( MAXIN = 12 ) - INTEGER MAXRHS - PARAMETER ( MAXRHS = 16 ) - INTEGER MATMAX - PARAMETER ( MATMAX = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) - INTEGER KDMAX - PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) -* .. -* .. Local Scalars .. - LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*2 C2 - CHARACTER*3 PATH - CHARACTER*10 INTSTR - CHARACTER*72 ALINE - INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, - $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - REAL EPS, S1, S2, THREQ, THRESH -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MATMAX ) - INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), - $ NBVAL( MAXIN ), NBVAL2( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) -* .. -* .. External Functions .. - LOGICAL LSAME, LSAMEN - REAL SECOND, SLAMCH - EXTERNAL LSAME, LSAMEN, SECOND, SLAMCH -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, - $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, - $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, - $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, - $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, - $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, - $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, - $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, - $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, - $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, - $ CCHKQRT, CCHKQRTP -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, NUNIT -* .. -* .. Arrays in Common .. - INTEGER IPARMS( 100 ) -* .. -* .. Common blocks .. 
- COMMON / CLAENV / IPARMS - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT -* .. -* .. Data statements .. - DATA THREQ / 2.0 / , INTSTR / '0123456789' / -* .. -* .. Executable Statements .. -* - S1 = SECOND( ) - LDA = NMAX - FATAL = .FALSE. -* -* Read a dummy line. -* - READ( NIN, FMT = * ) -* -* Report values of parameters. -* - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH -* -* Read the values of M -* - READ( NIN, FMT = * )NM - IF( NM.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 - NM = 0 - FATAL = .TRUE. - ELSE IF( NM.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN - NM = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) - DO 10 I = 1, NM - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 10 CONTINUE - IF( NM.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) -* -* Read the values of N -* - READ( NIN, FMT = * )NN - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 20 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - IF( NN.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) -* -* Read the values of NRHS -* - READ( NIN, FMT = * )NNS - IF( NNS.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 - NNS = 0 - FATAL = .TRUE. - ELSE IF( NNS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN - NNS = 0 - FATAL = .TRUE. 
- END IF - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) - DO 30 I = 1, NNS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN - WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS - FATAL = .TRUE. - END IF - 30 CONTINUE - IF( NNS.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) -* -* Read the values of NB -* - READ( NIN, FMT = * )NNB - IF( NNB.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 - NNB = 0 - FATAL = .TRUE. - ELSE IF( NNB.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN - NNB = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) - DO 40 I = 1, NNB - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - END IF - 40 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) -* -* Set NBVAL2 to be the set of unique values of NB -* - NNB2 = 0 - DO 60 I = 1, NNB - NB = NBVAL( I ) - DO 50 J = 1, NNB2 - IF( NB.EQ.NBVAL2( J ) ) - $ GO TO 60 - 50 CONTINUE - NNB2 = NNB2 + 1 - NBVAL2( NNB2 ) = NB - 60 CONTINUE -* -* Read the values of NX -* - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) - DO 70 I = 1, NNB - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - END IF - 70 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) -* -* Read the values of RANKVAL -* - READ( NIN, FMT = * )NRANK - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 - NRANK = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN - NRANK = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) - DO I = 1, NRANK - IF( RANKVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 - FATAL = .TRUE. 
- ELSE IF( RANKVAL( I ).GT.100 ) THEN - WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 - FATAL = .TRUE. - END IF - END DO - IF( NRANK.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', - $ ( RANKVAL( I ), I = 1, NRANK ) -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9992 )THRESH -* -* Read the flag that indicates whether to test the LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test the driver routines. -* - READ( NIN, FMT = * )TSTDRV -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Calculate and print the machine dependent constants. -* - EPS = SLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9991 )'underflow', EPS - EPS = SLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9991 )'overflow ', EPS - EPS = SLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9991 )'precision', EPS - WRITE( NOUT, FMT = * ) - NRHS = NSVAL( 1 ) -* - 80 CONTINUE -* -* Read a test path and the number of matrix types to use. -* - READ( NIN, FMT = '(A72)', END = 140 )ALINE - PATH = ALINE( 1: 3 ) - NMATS = MATMAX - I = 3 - 90 CONTINUE - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - IF( ALINE( I: I ).EQ.' ' ) - $ GO TO 90 - NMATS = 0 - 100 CONTINUE - C1 = ALINE( I: I ) - DO 110 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 120 - END IF - 110 CONTINUE - GO TO 130 - 120 CONTINUE - NMATS = NMATS*10 + IC - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - GO TO 100 - 130 CONTINUE - C1 = PATH( 1: 1 ) - C2 = PATH( 2: 3 ) -* -* Check first character for correct precision. -* - IF( .NOT.LSAME( C1, 'Complex precision' ) ) THEN - WRITE( NOUT, FMT = 9990 )PATH -* - ELSE IF( NMATS.LE.0 ) THEN -* -* Check for a positive number of tests requested. 
-* - WRITE( NOUT, FMT = 9989 )PATH -* - ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN -* -* GE: general matrices -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN -* -* GB: general banded matrices -* - LA = ( 2*KDMAX+1 )*NMAX - LAFAC = ( 3*KDMAX+1 )*NMAX - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, - $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN -* -* GT: general tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH 
- END IF -* - ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN -* -* PO: positive definite matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN -* -* PS: positive semi-definite matrices -* - NTYPES = 9 -* - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, - $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN -* -* PP: positive definite packed matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN -* -* PB: positive definite banded matrices -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), 
B( 1, 3 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN -* -* PT: positive definite tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HE' ) ) THEN -* -* HE: Hermitian indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HR' ) ) THEN -* -* HR: Hermitian indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 
2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HK' ) ) THEN -* -* HK: Hermitian indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than HR path version. -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN -* -* HA: Hermitian matrices, -* Aasen Algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN -* -* H2: Hermitian matrices, -* with partial (Aasen's) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL 
CCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, - $ NNS, NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN -* -* HP: Hermitian indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN -* -* SY: symmetric indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN -* -* SR: symmetric indefinite matrices, -* with bounded Bunch-Kaufman 
(rook) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN -* -* SK: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than SR path version. -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN -* -* SA: symmetric indefinite matrices with Aasen's algorithm, -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, 
NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN -* -* S2: symmetric indefinite matrices with Aasen's algorithm -* 2 stage -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN -* -* SP: symmetric indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN -* -* TR: triangular matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN -* -* TP: triangular packed matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - 
IF( TSTCHK ) THEN - CALL CCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN -* -* TB: triangular banded matrices -* - NTYPES = 17 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN -* -* QR: QR factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN -* -* LQ: LQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN -* -* QL: QL factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN -* -* RQ: RQ factorization -* - NTYPES = 8 - 
CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN -* -* EQ: Equilibration routines for general and positive definite -* matrices (THREQ should be between 2 and 10) -* - IF( TSTCHK ) THEN - CALL CCHKEQ( THREQ, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN -* -* TZ: Trapezoidal matrix -* - NTYPES = 3 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), S( 1 ), - $ B( 1, 1 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN -* -* QP: QR factorization with pivoting -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), S( 1 ), - $ B( 1, 1 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN -* -* LS: Least squares drivers -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTDRV ) THEN - CALL CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, - $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ S( 1 ), S( NMAX+1 ), NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN -* -* QT: QRT routines for general matrices -* - IF( TSTCHK ) THEN - CALL CCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QX' 
) ) THEN -* -* QX: QRT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL CCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL CCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL CCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL CCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN -* -* HH: Householder reconstruction for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 ) PATH - END IF -* - ELSE -* - WRITE( NOUT, FMT = 9990 )PATH - END IF -* -* Go back to get another input line. -* - GO TO 80 -* -* Branch to this line when the last record is read. 
-* - 140 CONTINUE - CLOSE ( NIN ) - S2 = SECOND( ) - WRITE( NOUT, FMT = 9998 ) - WRITE( NOUT, FMT = 9997 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9998 FORMAT( / ' End of tests' ) - 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', - $ I6 ) - 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', - $ I6 ) - 9994 FORMAT( ' Tests of the COMPLEX LAPACK routines ', - $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, - $ / / ' The following parameter values will be used:' ) - 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) - 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9991 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) - 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) - 9989 FORMAT( / 1X, A3, ' routines were not tested' ) - 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) -* -* End of CCHKAA -* - END From 93cc066921f97b9d593b2c8fa258b54f34fb5510 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:54:13 +0200 Subject: [PATCH 017/108] Delete dchkaa.f --- lapack-netlib/TESTING/LIN/dchkaa.f | 1063 ---------------------------- 1 file changed, 1063 deletions(-) delete mode 100644 lapack-netlib/TESTING/LIN/dchkaa.f diff --git a/lapack-netlib/TESTING/LIN/dchkaa.f b/lapack-netlib/TESTING/LIN/dchkaa.f deleted file mode 100644 index 03575c4d1..000000000 --- a/lapack-netlib/TESTING/LIN/dchkaa.f +++ /dev/null @@ -1,1063 +0,0 @@ -*> \brief \b DCHKAA -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM DCHKAA -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> DCHKAA is the main test program for the DOUBLE PRECISION LAPACK -*> linear equation routines -*> -*> The program must be driven by a short data file. 
The first 15 records -*> (not including the first comment line) specify problem dimensions -*> and program options using list-directed input. The remaining lines -*> specify the LAPACK test paths and the number of matrix types to use -*> in testing. An annotated example of a data file can be obtained by -*> deleting the first 3 characters from the following 40 lines: -*> Data file for testing DOUBLE PRECISION LAPACK linear eqn. routines -*> 7 Number of values of M -*> 0 1 2 3 5 10 16 Values of M (row dimension) -*> 7 Number of values of N -*> 0 1 2 3 5 10 16 Values of N (column dimension) -*> 1 Number of values of NRHS -*> 2 Values of NRHS (number of right hand sides) -*> 5 Number of values of NB -*> 1 3 3 3 20 Values of NB (the blocksize) -*> 1 0 5 9 1 Values of NX (crossover point) -*> 3 Number of values of RANK -*> 30 50 90 Values of rank (as a % of N) -*> 20.0 Threshold value of test ratio -*> T Put T to test the LAPACK routines -*> T Put T to test the driver routines -*> T Put T to test the error exits -*> DGE 11 List types on next line if 0 < NTYPES < 11 -*> DGB 8 List types on next line if 0 < NTYPES < 8 -*> DGT 12 List types on next line if 0 < NTYPES < 12 -*> DPO 9 List types on next line if 0 < NTYPES < 9 -*> DPS 9 List types on next line if 0 < NTYPES < 9 -*> DPP 9 List types on next line if 0 < NTYPES < 9 -*> DPB 8 List types on next line if 0 < NTYPES < 8 -*> DPT 12 List types on next line if 0 < NTYPES < 12 -*> DSY 10 List types on next line if 0 < NTYPES < 10 -*> DSR 10 List types on next line if 0 < NTYPES < 10 -*> DSK 10 List types on next line if 0 < NTYPES < 10 -*> DSA 10 List types on next line if 0 < NTYPES < 10 -*> DS2 10 List types on next line if 0 < NTYPES < 10 -*> DSP 10 List types on next line if 0 < NTYPES < 10 -*> DTR 18 List types on next line if 0 < NTYPES < 18 -*> DTP 18 List types on next line if 0 < NTYPES < 18 -*> DTB 17 List types on next line if 0 < NTYPES < 17 -*> DQR 8 List types on next line if 0 < NTYPES < 8 -*> DRQ 8 List 
types on next line if 0 < NTYPES < 8 -*> DLQ 8 List types on next line if 0 < NTYPES < 8 -*> DQL 8 List types on next line if 0 < NTYPES < 8 -*> DQP 6 List types on next line if 0 < NTYPES < 6 -*> DTZ 3 List types on next line if 0 < NTYPES < 3 -*> DLS 6 List types on next line if 0 < NTYPES < 6 -*> DEQ -*> DQT -*> DQX -*> DTQ -*> DXQ -*> DTS -*> DHH -*> \endverbatim -* -* Parameters: -* ========== -* -*> \verbatim -*> NMAX INTEGER -*> The maximum allowable value for M and N. -*> -*> MAXIN INTEGER -*> The number of different values that can be used for each of -*> M, N, NRHS, NB, NX and RANK -*> -*> MAXRHS INTEGER -*> The maximum number of right hand sides -*> -*> MATMAX INTEGER -*> The maximum number of matrix types to use for testing -*> -*> NIN INTEGER -*> The unit number for input -*> -*> NOUT INTEGER -*> The unit number for output -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date November 2019 -* -*> \ingroup double_lin -* -* ===================================================================== - PROGRAM DCHKAA -* -* -- LAPACK test routine (version 3.9.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* Novemebr 2019 -* -* ===================================================================== -* -* .. Parameters .. - INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER MAXIN - PARAMETER ( MAXIN = 12 ) - INTEGER MAXRHS - PARAMETER ( MAXRHS = 16 ) - INTEGER MATMAX - PARAMETER ( MATMAX = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) - INTEGER KDMAX - PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) -* .. -* .. Local Scalars .. 
- LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*2 C2 - CHARACTER*3 PATH - CHARACTER*10 INTSTR - CHARACTER*72 ALINE - INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, - $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - DOUBLE PRECISION EPS, S1, S2, THREQ, THRESH -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MATMAX ) - INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), - $ NBVAL( MAXIN ), NBVAL2( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, 3*NMAX+MAXRHS+30 ) -* .. -* .. External Functions .. - LOGICAL LSAME, LSAMEN - DOUBLE PRECISION DLAMCH, DSECND - EXTERNAL LSAME, LSAMEN, DLAMCH, DSECND -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, - $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, - $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, - $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, - $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, - $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, - $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, - $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, - $ DCHKLQT,DCHKTSQR -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, NUNIT -* .. -* .. Arrays in Common .. - INTEGER IPARMS( 100 ) -* .. -* .. Common blocks .. - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT - COMMON / CLAENV / IPARMS -* .. -* .. Data statements .. - DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / -* .. -* .. Executable Statements .. -* - S1 = DSECND( ) - LDA = NMAX - FATAL = .FALSE. -* -* Read a dummy line. -* - READ( NIN, FMT = * ) -* -* Report values of parameters. 
-* - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH -* -* Read the values of M -* - READ( NIN, FMT = * )NM - IF( NM.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 - NM = 0 - FATAL = .TRUE. - ELSE IF( NM.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN - NM = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) - DO 10 I = 1, NM - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 10 CONTINUE - IF( NM.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) -* -* Read the values of N -* - READ( NIN, FMT = * )NN - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 20 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - IF( NN.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) -* -* Read the values of NRHS -* - READ( NIN, FMT = * )NNS - IF( NNS.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 - NNS = 0 - FATAL = .TRUE. - ELSE IF( NNS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN - NNS = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) - DO 30 I = 1, NNS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN - WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS - FATAL = .TRUE. 
- END IF - 30 CONTINUE - IF( NNS.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) -* -* Read the values of NB -* - READ( NIN, FMT = * )NNB - IF( NNB.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 - NNB = 0 - FATAL = .TRUE. - ELSE IF( NNB.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN - NNB = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) - DO 40 I = 1, NNB - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - END IF - 40 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) -* -* Set NBVAL2 to be the set of unique values of NB -* - NNB2 = 0 - DO 60 I = 1, NNB - NB = NBVAL( I ) - DO 50 J = 1, NNB2 - IF( NB.EQ.NBVAL2( J ) ) - $ GO TO 60 - 50 CONTINUE - NNB2 = NNB2 + 1 - NBVAL2( NNB2 ) = NB - 60 CONTINUE -* -* Read the values of NX -* - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) - DO 70 I = 1, NNB - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - END IF - 70 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) -* -* Read the values of RANKVAL -* - READ( NIN, FMT = * )NRANK - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 - NRANK = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN - NRANK = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) - DO I = 1, NRANK - IF( RANKVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( RANKVAL( I ).GT.100 ) THEN - WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 - FATAL = .TRUE. - END IF - END DO - IF( NRANK.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', - $ ( RANKVAL( I ), I = 1, NRANK ) -* -* Read the threshold value for the test ratios. 
-* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9992 )THRESH -* -* Read the flag that indicates whether to test the LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test the driver routines. -* - READ( NIN, FMT = * )TSTDRV -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Calculate and print the machine dependent constants. -* - EPS = DLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9991 )'underflow', EPS - EPS = DLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9991 )'overflow ', EPS - EPS = DLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9991 )'precision', EPS - WRITE( NOUT, FMT = * ) -* - 80 CONTINUE -* -* Read a test path and the number of matrix types to use. -* - READ( NIN, FMT = '(A72)', END = 140 )ALINE - PATH = ALINE( 1: 3 ) - NMATS = MATMAX - I = 3 - 90 CONTINUE - I = I + 1 - IF( I.GT.72 ) THEN - NMATS = MATMAX - GO TO 130 - END IF - IF( ALINE( I: I ).EQ.' ' ) - $ GO TO 90 - NMATS = 0 - 100 CONTINUE - C1 = ALINE( I: I ) - DO 110 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 120 - END IF - 110 CONTINUE - GO TO 130 - 120 CONTINUE - NMATS = NMATS*10 + IC - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - GO TO 100 - 130 CONTINUE - C1 = PATH( 1: 1 ) - C2 = PATH( 2: 3 ) - NRHS = NSVAL( 1 ) -* -* Check first character for correct precision. -* - IF( .NOT.LSAME( C1, 'Double precision' ) ) THEN - WRITE( NOUT, FMT = 9990 )PATH -* - ELSE IF( NMATS.LE.0 ) THEN -* -* Check for a positive number of tests requested. 
-* - WRITE( NOUT, FMT = 9989 )PATH -* - ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN -* -* GE: general matrices -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN -* -* GB: general banded matrices -* - LA = ( 2*KDMAX+1 )*NMAX - LAFAC = ( 3*KDMAX+1 )*NMAX - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, - $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN -* -* GT: general tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH 
- END IF -* - ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN -* -* PO: positive definite matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN -* -* PS: positive semi-definite matrices -* - NTYPES = 9 -* - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, - $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN -* -* PP: positive definite packed matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN -* -* PB: positive definite banded matrices -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 
1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN -* -* PT: positive definite tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN -* -* SY: symmetric indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN -* -* SR: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, 
LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN -* -* SK: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than SR path version. -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN -* -* SA: symmetric indefinite matrices, -* with partial (Aasen's) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* -* - ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN -* -* SA: symmetric indefinite matrices, -* with partial (Aasen's) pivoting 
algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, - $ NNS, NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVSY_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* -* - ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN -* -* SP: symmetric indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN -* -* TR: triangular matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN -* -* TP: triangular packed matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), 
WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN -* -* TB: triangular banded matrices -* - NTYPES = 17 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN -* -* QR: QR factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN -* -* LQ: LQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN -* -* QL: QL factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN -* -* RQ: RQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, 
NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN -* -* QP: QR factorization with pivoting -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN -* -* TZ: Trapezoidal matrix -* - NTYPES = 3 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN -* -* LS: Least squares drivers -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTDRV ) THEN - CALL DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, - $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), - $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ RWORK, RWORK( NMAX+1 ), NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN -* -* EQ: Equilibration routines for general and positive definite -* matrices (THREQ should be between 2 and 10) -* - IF( TSTCHK ) THEN - CALL DCHKEQ( THREQ, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN -* -* QT: QRT routines for general matrices -* - IF( TSTCHK ) THEN - CALL DCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN -* -* QX: QRT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL DCHKQRTP( THRESH, TSTERR, NM, 
MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL DCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL DCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL DCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN -* -* HH: Householder reconstruction for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 ) PATH - END IF -* - ELSE - -* - WRITE( NOUT, FMT = 9990 )PATH - END IF -* -* Go back to get another input line. -* - GO TO 80 -* -* Branch to this line when the last record is read. 
-* - 140 CONTINUE - CLOSE ( NIN ) - S2 = DSECND( ) - WRITE( NOUT, FMT = 9998 ) - WRITE( NOUT, FMT = 9997 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9998 FORMAT( / ' End of tests' ) - 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', - $ I6 ) - 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', - $ I6 ) - 9994 FORMAT( ' Tests of the DOUBLE PRECISION LAPACK routines ', - $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, - $ / / ' The following parameter values will be used:' ) - 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) - 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9991 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) - 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) - 9989 FORMAT( / 1X, A3, ' routines were not tested' ) - 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) -* -* End of DCHKAA -* - END From f7bcd962c19ec997514ec65f0222713405ac6dea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:54:54 +0200 Subject: [PATCH 018/108] Delete schkaa.f --- lapack-netlib/TESTING/LIN/schkaa.f | 1058 ---------------------------- 1 file changed, 1058 deletions(-) delete mode 100644 lapack-netlib/TESTING/LIN/schkaa.f diff --git a/lapack-netlib/TESTING/LIN/schkaa.f b/lapack-netlib/TESTING/LIN/schkaa.f deleted file mode 100644 index a9c13e442..000000000 --- a/lapack-netlib/TESTING/LIN/schkaa.f +++ /dev/null @@ -1,1058 +0,0 @@ -*> \brief \b SCHKAA -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM SCHKAA -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> SCHKAA is the main test program for the REAL LAPACK -*> linear equation routines -*> -*> The program must be driven by a short data file. 
The first 15 records -*> (not including the first comment line) specify problem dimensions -*> and program options using list-directed input. The remaining lines -*> specify the LAPACK test paths and the number of matrix types to use -*> in testing. An annotated example of a data file can be obtained by -*> deleting the first 3 characters from the following 40 lines: -*> Data file for testing REAL LAPACK linear eqn. routines -*> 7 Number of values of M -*> 0 1 2 3 5 10 16 Values of M (row dimension) -*> 7 Number of values of N -*> 0 1 2 3 5 10 16 Values of N (column dimension) -*> 1 Number of values of NRHS -*> 2 Values of NRHS (number of right hand sides) -*> 5 Number of values of NB -*> 1 3 3 3 20 Values of NB (the blocksize) -*> 1 0 5 9 1 Values of NX (crossover point) -*> 3 Number of values of RANK -*> 30 50 90 Values of rank (as a % of N) -*> 20.0 Threshold value of test ratio -*> T Put T to test the LAPACK routines -*> T Put T to test the driver routines -*> T Put T to test the error exits -*> SGE 11 List types on next line if 0 < NTYPES < 11 -*> SGB 8 List types on next line if 0 < NTYPES < 8 -*> SGT 12 List types on next line if 0 < NTYPES < 12 -*> SPO 9 List types on next line if 0 < NTYPES < 9 -*> SPS 9 List types on next line if 0 < NTYPES < 9 -*> SPP 9 List types on next line if 0 < NTYPES < 9 -*> SPB 8 List types on next line if 0 < NTYPES < 8 -*> SPT 12 List types on next line if 0 < NTYPES < 12 -*> SSY 10 List types on next line if 0 < NTYPES < 10 -*> SSR 10 List types on next line if 0 < NTYPES < 10 -*> SSK 10 List types on next line if 0 < NTYPES < 10 -*> SSA 10 List types on next line if 0 < NTYPES < 10 -*> SS2 10 List types on next line if 0 < NTYPES < 10 -*> SSP 10 List types on next line if 0 < NTYPES < 10 -*> STR 18 List types on next line if 0 < NTYPES < 18 -*> STP 18 List types on next line if 0 < NTYPES < 18 -*> STB 17 List types on next line if 0 < NTYPES < 17 -*> SQR 8 List types on next line if 0 < NTYPES < 8 -*> SRQ 8 List types on next 
line if 0 < NTYPES < 8 -*> SLQ 8 List types on next line if 0 < NTYPES < 8 -*> SQL 8 List types on next line if 0 < NTYPES < 8 -*> SQP 6 List types on next line if 0 < NTYPES < 6 -*> STZ 3 List types on next line if 0 < NTYPES < 3 -*> SLS 6 List types on next line if 0 < NTYPES < 6 -*> SEQ -*> SQT -*> SQX -*> STS -*> SHH -*> \endverbatim -* -* Parameters: -* ========== -* -*> \verbatim -*> NMAX INTEGER -*> The maximum allowable value for M and N. -*> -*> MAXIN INTEGER -*> The number of different values that can be used for each of -*> M, N, NRHS, NB, NX and RANK -*> -*> MAXRHS INTEGER -*> The maximum number of right hand sides -*> -*> MATMAX INTEGER -*> The maximum number of matrix types to use for testing -*> -*> NIN INTEGER -*> The unit number for input -*> -*> NOUT INTEGER -*> The unit number for output -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date November 2019 -* -*> \ingroup single_lin -* -* ===================================================================== - PROGRAM SCHKAA -* -* -- LAPACK test routine (version 3.9.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 -* -* ===================================================================== -* -* .. Parameters .. - INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER MAXIN - PARAMETER ( MAXIN = 12 ) - INTEGER MAXRHS - PARAMETER ( MAXRHS = 16 ) - INTEGER MATMAX - PARAMETER ( MATMAX = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) - INTEGER KDMAX - PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) -* .. -* .. Local Scalars .. 
- LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*2 C2 - CHARACTER*3 PATH - CHARACTER*10 INTSTR - CHARACTER*72 ALINE - INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, - $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - REAL EPS, S1, S2, THREQ, THRESH -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MATMAX ) - INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), - $ NBVAL( MAXIN ), NBVAL2( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, NMAX+MAXRHS+30 ) -* .. -* .. External Functions .. - LOGICAL LSAME, LSAMEN - REAL SECOND, SLAMCH - EXTERNAL LSAME, LSAMEN, SECOND, SLAMCH -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ, - $ SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP, - $ SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP, - $ SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, - $ SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, - $ SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, - $ SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK, - $ SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP, - $ SCHKLQT, SCHKTSQR -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, NUNIT -* .. -* .. Arrays in Common .. - INTEGER IPARMS( 100 ) -* .. -* .. Common blocks .. - COMMON / CLAENV / IPARMS - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT -* .. -* .. Data statements .. - DATA THREQ / 2.0E0 / , INTSTR / '0123456789' / -* .. -* .. Executable Statements .. -* - S1 = SECOND( ) - LDA = NMAX - FATAL = .FALSE. -* -* Read a dummy line. -* - READ( NIN, FMT = * ) -* -* Report values of parameters. 
-* - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH -* -* Read the values of M -* - READ( NIN, FMT = * )NM - IF( NM.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 - NM = 0 - FATAL = .TRUE. - ELSE IF( NM.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN - NM = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) - DO 10 I = 1, NM - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 10 CONTINUE - IF( NM.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) -* -* Read the values of N -* - READ( NIN, FMT = * )NN - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 20 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - IF( NN.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) -* -* Read the values of NRHS -* - READ( NIN, FMT = * )NNS - IF( NNS.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 - NNS = 0 - FATAL = .TRUE. - ELSE IF( NNS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN - NNS = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) - DO 30 I = 1, NNS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN - WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS - FATAL = .TRUE. 
- END IF - 30 CONTINUE - IF( NNS.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) -* -* Read the values of NB -* - READ( NIN, FMT = * )NNB - IF( NNB.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 - NNB = 0 - FATAL = .TRUE. - ELSE IF( NNB.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN - NNB = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) - DO 40 I = 1, NNB - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - END IF - 40 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) -* -* Set NBVAL2 to be the set of unique values of NB -* - NNB2 = 0 - DO 60 I = 1, NNB - NB = NBVAL( I ) - DO 50 J = 1, NNB2 - IF( NB.EQ.NBVAL2( J ) ) - $ GO TO 60 - 50 CONTINUE - NNB2 = NNB2 + 1 - NBVAL2( NNB2 ) = NB - 60 CONTINUE -* -* Read the values of NX -* - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) - DO 70 I = 1, NNB - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - END IF - 70 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) -* -* Read the values of RANKVAL -* - READ( NIN, FMT = * )NRANK - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 - NRANK = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN - NRANK = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) - DO I = 1, NRANK - IF( RANKVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( RANKVAL( I ).GT.100 ) THEN - WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 - FATAL = .TRUE. - END IF - END DO - IF( NRANK.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', - $ ( RANKVAL( I ), I = 1, NRANK ) -* -* Read the threshold value for the test ratios. 
-* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9992 )THRESH -* -* Read the flag that indicates whether to test the LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test the driver routines. -* - READ( NIN, FMT = * )TSTDRV -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Calculate and print the machine dependent constants. -* - EPS = SLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9991 )'underflow', EPS - EPS = SLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9991 )'overflow ', EPS - EPS = SLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9991 )'precision', EPS - WRITE( NOUT, FMT = * ) -* - 80 CONTINUE -* -* Read a test path and the number of matrix types to use. -* - READ( NIN, FMT = '(A72)', END = 140 )ALINE - PATH = ALINE( 1: 3 ) - NMATS = MATMAX - I = 3 - 90 CONTINUE - I = I + 1 - IF( I.GT.72 ) THEN - NMATS = MATMAX - GO TO 130 - END IF - IF( ALINE( I: I ).EQ.' ' ) - $ GO TO 90 - NMATS = 0 - 100 CONTINUE - C1 = ALINE( I: I ) - DO 110 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 120 - END IF - 110 CONTINUE - GO TO 130 - 120 CONTINUE - NMATS = NMATS*10 + IC - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - GO TO 100 - 130 CONTINUE - C1 = PATH( 1: 1 ) - C2 = PATH( 2: 3 ) - NRHS = NSVAL( 1 ) -* -* Check first character for correct precision. -* - IF( .NOT.LSAME( C1, 'Single precision' ) ) THEN - WRITE( NOUT, FMT = 9990 )PATH -* - ELSE IF( NMATS.LE.0 ) THEN -* -* Check for a positive number of tests requested. 
-* - WRITE( NOUT, FMT = 9989 )PATH -* - ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN -* -* GE: general matrices -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN -* -* GB: general banded matrices -* - LA = ( 2*KDMAX+1 )*NMAX - LAFAC = ( 3*KDMAX+1 )*NMAX - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, - $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN -* -* GT: general tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH 
- END IF -* - ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN -* -* PO: positive definite matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN -* -* PS: positive semi-definite matrices -* - NTYPES = 9 -* - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, - $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN -* -* PP: positive definite packed matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN -* -* PB: positive definite banded matrices -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 
1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN -* -* PT: positive definite tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN -* -* SY: symmetric indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN -* -* SR: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, 
LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN -* -* SK: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than SR path version. -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN -* -* SA: symmetric indefinite matrices, -* with partial (Aasen's) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN -* -* SA: symmetric indefinite matrices, -* with partial (Aasen's) pivoting 
algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, - $ NNS, NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN -* -* SP: symmetric indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN -* -* TR: triangular matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN -* -* TP: triangular packed matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, 
RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN -* -* TB: triangular banded matrices -* - NTYPES = 17 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN -* -* QR: QR factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN -* -* LQ: LQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN -* -* QL: QL factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN -* -* RQ: RQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, 
NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN -* -* QP: QR factorization with pivoting -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN -* -* TZ: Trapezoidal matrix -* - NTYPES = 3 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN -* -* LS: Least squares drivers -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTDRV ) THEN - CALL SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, - $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), - $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ RWORK, RWORK( NMAX+1 ), NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN -* -* EQ: Equilibration routines for general and positive definite -* matrices (THREQ should be between 2 and 10) -* - IF( TSTCHK ) THEN - CALL SCHKEQ( THREQ, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN -* -* QT: QRT routines for general matrices -* - IF( TSTCHK ) THEN - CALL SCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN -* -* QX: QRT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL SCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, 
NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL SCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL SCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL SCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN -* -* HH: Householder reconstruction for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 ) PATH - END IF -* - ELSE -* - WRITE( NOUT, FMT = 9990 )PATH - END IF -* -* Go back to get another input line. -* - GO TO 80 -* -* Branch to this line when the last record is read. 
-* - 140 CONTINUE - CLOSE ( NIN ) - S2 = SECOND( ) - WRITE( NOUT, FMT = 9998 ) - WRITE( NOUT, FMT = 9997 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9998 FORMAT( / ' End of tests' ) - 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', - $ I6 ) - 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', - $ I6 ) - 9994 FORMAT( ' Tests of the REAL LAPACK routines ', - $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, - $ / / ' The following parameter values will be used:' ) - 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) - 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9991 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) - 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) - 9989 FORMAT( / 1X, A3, ' routines were not tested' ) - 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) -* -* End of SCHKAA -* - END From 15b9d6b4a70aed9e7010aea7009f14f1098e11c1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:55:31 +0200 Subject: [PATCH 019/108] Delete zchkaa.f --- lapack-netlib/TESTING/LIN/zchkaa.f | 1255 ---------------------------- 1 file changed, 1255 deletions(-) delete mode 100644 lapack-netlib/TESTING/LIN/zchkaa.f diff --git a/lapack-netlib/TESTING/LIN/zchkaa.f b/lapack-netlib/TESTING/LIN/zchkaa.f deleted file mode 100644 index 30d2a084a..000000000 --- a/lapack-netlib/TESTING/LIN/zchkaa.f +++ /dev/null @@ -1,1255 +0,0 @@ -*> \brief \b ZCHKAA -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM ZCHKAA -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> ZCHKAA is the main test program for the COMPLEX*16 linear equation -*> routines. -*> -*> The program must be driven by a short data file. 
The first 15 records -*> (not including the first comment line) specify problem dimensions -*> and program options using list-directed input. The remaining lines -*> specify the LAPACK test paths and the number of matrix types to use -*> in testing. An annotated example of a data file can be obtained by -*> deleting the first 3 characters from the following 42 lines: -*> Data file for testing COMPLEX*16 LAPACK linear equation routines -*> 7 Number of values of M -*> 0 1 2 3 5 10 16 Values of M (row dimension) -*> 7 Number of values of N -*> 0 1 2 3 5 10 16 Values of N (column dimension) -*> 1 Number of values of NRHS -*> 2 Values of NRHS (number of right hand sides) -*> 5 Number of values of NB -*> 1 3 3 3 20 Values of NB (the blocksize) -*> 1 0 5 9 1 Values of NX (crossover point) -*> 3 Number of values of RANK -*> 30 50 90 Values of rank (as a % of N) -*> 30.0 Threshold value of test ratio -*> T Put T to test the LAPACK routines -*> T Put T to test the driver routines -*> T Put T to test the error exits -*> ZGE 11 List types on next line if 0 < NTYPES < 11 -*> ZGB 8 List types on next line if 0 < NTYPES < 8 -*> ZGT 12 List types on next line if 0 < NTYPES < 12 -*> ZPO 9 List types on next line if 0 < NTYPES < 9 -*> ZPS 9 List types on next line if 0 < NTYPES < 9 -*> ZPP 9 List types on next line if 0 < NTYPES < 9 -*> ZPB 8 List types on next line if 0 < NTYPES < 8 -*> ZPT 12 List types on next line if 0 < NTYPES < 12 -*> ZHE 10 List types on next line if 0 < NTYPES < 10 -*> ZHR 10 List types on next line if 0 < NTYPES < 10 -*> ZHK 10 List types on next line if 0 < NTYPES < 10 -*> ZHA 10 List types on next line if 0 < NTYPES < 10 -*> ZH2 10 List types on next line if 0 < NTYPES < 10 -*> ZSA 11 List types on next line if 0 < NTYPES < 10 -*> ZS2 11 List types on next line if 0 < NTYPES < 10 -*> ZHP 10 List types on next line if 0 < NTYPES < 10 -*> ZSY 11 List types on next line if 0 < NTYPES < 11 -*> ZSR 11 List types on next line if 0 < NTYPES < 11 -*> ZSK 11 List 
types on next line if 0 < NTYPES < 11 -*> ZSP 11 List types on next line if 0 < NTYPES < 11 -*> ZTR 18 List types on next line if 0 < NTYPES < 18 -*> ZTP 18 List types on next line if 0 < NTYPES < 18 -*> ZTB 17 List types on next line if 0 < NTYPES < 17 -*> ZQR 8 List types on next line if 0 < NTYPES < 8 -*> ZRQ 8 List types on next line if 0 < NTYPES < 8 -*> ZLQ 8 List types on next line if 0 < NTYPES < 8 -*> ZQL 8 List types on next line if 0 < NTYPES < 8 -*> ZQP 6 List types on next line if 0 < NTYPES < 6 -*> ZTZ 3 List types on next line if 0 < NTYPES < 3 -*> ZLS 6 List types on next line if 0 < NTYPES < 6 -*> ZEQ -*> ZQT -*> ZQX -*> ZTS -*> ZHH -*> \endverbatim -* -* Parameters: -* ========== -* -*> \verbatim -*> NMAX INTEGER -*> The maximum allowable value for M and N. -*> -*> MAXIN INTEGER -*> The number of different values that can be used for each of -*> M, N, NRHS, NB, NX and RANK -*> -*> MAXRHS INTEGER -*> The maximum number of right hand sides -*> -*> MATMAX INTEGER -*> The maximum number of matrix types to use for testing -*> -*> NIN INTEGER -*> The unit number for input -*> -*> NOUT INTEGER -*> The unit number for output -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date November 2019 -* -*> \ingroup complex16_lin -* -* ===================================================================== - PROGRAM ZCHKAA -* -* -- LAPACK test routine (version 3.9.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 -* -* ===================================================================== -* -* .. Parameters .. 
- INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER MAXIN - PARAMETER ( MAXIN = 12 ) - INTEGER MAXRHS - PARAMETER ( MAXRHS = 16 ) - INTEGER MATMAX - PARAMETER ( MATMAX = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) - INTEGER KDMAX - PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) -* .. -* .. Local Scalars .. - LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*2 C2 - CHARACTER*3 PATH - CHARACTER*10 INTSTR - CHARACTER*72 ALINE - INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, - $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - DOUBLE PRECISION EPS, S1, S2, THREQ, THRESH -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MATMAX ) - INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), - $ NBVAL( MAXIN ), NBVAL2( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX*16 A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) -* .. -* .. External Functions .. - LOGICAL LSAME, LSAMEN - DOUBLE PRECISION DLAMCH, DSECND - EXTERNAL LSAME, LSAMEN, DLAMCH, DSECND -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, - $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, - $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, - $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, - $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, - $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, - $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, - $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, - $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, - $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, - $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, - $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, NUNIT -* .. -* .. Arrays in Common .. - INTEGER IPARMS( 100 ) -* .. -* .. Common blocks .. 
- COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT - COMMON / CLAENV / IPARMS -* .. -* .. Data statements .. - DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / -* .. -* .. Executable Statements .. -* - S1 = DSECND( ) - LDA = NMAX - FATAL = .FALSE. -* -* Read a dummy line. -* - READ( NIN, FMT = * ) -* -* Report values of parameters. -* - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH -* -* Read the values of M -* - READ( NIN, FMT = * )NM - IF( NM.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 - NM = 0 - FATAL = .TRUE. - ELSE IF( NM.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN - NM = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) - DO 10 I = 1, NM - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 10 CONTINUE - IF( NM.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) -* -* Read the values of N -* - READ( NIN, FMT = * )NN - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 20 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - IF( NN.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) -* -* Read the values of NRHS -* - READ( NIN, FMT = * )NNS - IF( NNS.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 - NNS = 0 - FATAL = .TRUE. - ELSE IF( NNS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN - NNS = 0 - FATAL = .TRUE. 
- END IF - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) - DO 30 I = 1, NNS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN - WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS - FATAL = .TRUE. - END IF - 30 CONTINUE - IF( NNS.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) -* -* Read the values of NB -* - READ( NIN, FMT = * )NNB - IF( NNB.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 - NNB = 0 - FATAL = .TRUE. - ELSE IF( NNB.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN - NNB = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) - DO 40 I = 1, NNB - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - END IF - 40 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) -* -* Set NBVAL2 to be the set of unique values of NB -* - NNB2 = 0 - DO 60 I = 1, NNB - NB = NBVAL( I ) - DO 50 J = 1, NNB2 - IF( NB.EQ.NBVAL2( J ) ) - $ GO TO 60 - 50 CONTINUE - NNB2 = NNB2 + 1 - NBVAL2( NNB2 ) = NB - 60 CONTINUE -* -* Read the values of NX -* - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) - DO 70 I = 1, NNB - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - END IF - 70 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) -* -* Read the values of RANKVAL -* - READ( NIN, FMT = * )NRANK - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 - NRANK = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN - NRANK = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) - DO I = 1, NRANK - IF( RANKVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 - FATAL = .TRUE. 
- ELSE IF( RANKVAL( I ).GT.100 ) THEN - WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 - FATAL = .TRUE. - END IF - END DO - IF( NRANK.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', - $ ( RANKVAL( I ), I = 1, NRANK ) -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9992 )THRESH -* -* Read the flag that indicates whether to test the LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test the driver routines. -* - READ( NIN, FMT = * )TSTDRV -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Calculate and print the machine dependent constants. -* - EPS = DLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9991 )'underflow', EPS - EPS = DLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9991 )'overflow ', EPS - EPS = DLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9991 )'precision', EPS - WRITE( NOUT, FMT = * ) - NRHS = NSVAL( 1 ) -* - 80 CONTINUE -* -* Read a test path and the number of matrix types to use. -* - READ( NIN, FMT = '(A72)', END = 140 )ALINE - PATH = ALINE( 1: 3 ) - NMATS = MATMAX - I = 3 - 90 CONTINUE - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - IF( ALINE( I: I ).EQ.' ' ) - $ GO TO 90 - NMATS = 0 - 100 CONTINUE - C1 = ALINE( I: I ) - DO 110 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 120 - END IF - 110 CONTINUE - GO TO 130 - 120 CONTINUE - NMATS = NMATS*10 + IC - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - GO TO 100 - 130 CONTINUE - C1 = PATH( 1: 1 ) - C2 = PATH( 2: 3 ) -* -* Check first character for correct precision. -* - IF( .NOT.LSAME( C1, 'Zomplex precision' ) ) THEN - WRITE( NOUT, FMT = 9990 )PATH -* - ELSE IF( NMATS.LE.0 ) THEN -* -* Check for a positive number of tests requested. 
-* - WRITE( NOUT, FMT = 9989 )PATH -* - ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN -* -* GE: general matrices -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN -* -* GB: general banded matrices -* - LA = ( 2*KDMAX+1 )*NMAX - LAFAC = ( 3*KDMAX+1 )*NMAX - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, - $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN -* -* GT: general tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH 
- END IF -* - ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN -* -* PO: positive definite matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN -* -* PS: positive semi-definite matrices -* - NTYPES = 9 -* - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, - $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN -* -* PP: positive definite packed matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN -* -* PB: positive definite banded matrices -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), 
B( 1, 3 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN -* -* PT: positive definite tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HE' ) ) THEN -* -* HE: Hermitian indefinite matrices -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF - - ELSE IF( LSAMEN( 2, C2, 'HR' ) ) THEN -* -* HR: Hermitian indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - 
ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HK' ) ) THEN -* -* HK: Hermitian indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than HR path version. -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE_RK ( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN -* -* HA: Hermitian matrices, -* Aasen Algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN -* -* H2: Hermitian matrices, -* with partial (Aasen's) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, 
NBVAL2, - $ NNS, NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* -* - ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN -* -* HP: Hermitian indefinite packed matrices -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN -* -* SY: symmetric indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN -* -* SR: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, 
NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN -* -* SK: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than SR path version. -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN -* -* SA: symmetric indefinite matrices with Aasen's algorithm, -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'S2' ) ) 
THEN -* -* S2: symmetric indefinite matrices with Aasen's algorithm -* 2 stage -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN -* -* SP: symmetric indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN -* -* TR: triangular matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN -* -* TP: triangular packed matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 
1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN -* -* TB: triangular banded matrices -* - NTYPES = 17 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN -* -* QR: QR factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN -* -* LQ: LQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN -* -* QL: QL factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN -* -* RQ: RQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKRQ( 
DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN -* -* EQ: Equilibration routines for general and positive definite -* matrices (THREQ should be between 2 and 10) -* - IF( TSTCHK ) THEN - CALL ZCHKEQ( THREQ, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN -* -* TZ: Trapezoidal matrix -* - NTYPES = 3 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), S( 1 ), - $ B( 1, 1 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN -* -* QP: QR factorization with pivoting -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), S( 1 ), - $ B( 1, 1 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN -* -* LS: Least squares drivers -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTDRV ) THEN - CALL ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, - $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ S( 1 ), S( NMAX+1 ), NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* -* - ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN -* -* QT: QRT routines for general matrices -* - IF( TSTCHK ) THEN - CALL ZCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN -* -* QX: QRT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) 
THEN - CALL ZCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL ZCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL ZCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL ZCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL ZCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL ZCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL ZCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN -* -* HH: Householder reconstruction for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 ) PATH - END IF -* - ELSE -* - WRITE( NOUT, FMT = 9990 )PATH - END IF -* -* Go back to get another input line. -* - GO TO 80 -* -* Branch to this line when the last record is read. 
-* - 140 CONTINUE - CLOSE ( NIN ) - S2 = DSECND( ) - WRITE( NOUT, FMT = 9998 ) - WRITE( NOUT, FMT = 9997 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9998 FORMAT( / ' End of tests' ) - 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', - $ I6 ) - 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', - $ I6 ) - 9994 FORMAT( ' Tests of the COMPLEX*16 LAPACK routines ', - $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, - $ / / ' The following parameter values will be used:' ) - 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) - 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9991 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) - 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) - 9989 FORMAT( / 1X, A3, ' routines were not tested' ) - 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) -* -* End of ZCHKAA -* - END From 26e87ac517edd08ac8da373e6cba4584d65479a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 20:39:55 +0200 Subject: [PATCH 020/108] Support Intel Ice Lake SP as Cooper Lake --- cpuid_x86.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 44704fcd9..18ff122e5 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1398,6 +1398,17 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 10: // Ice Lake SP + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; case 7: // family 6 exmodel 7 @@ -2112,7 +2123,22 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; -#endif +#endif + if (model == 10) +#ifndef NO_AVX512 + if(support_avx512_bf16()) + return 
CORE_COOPERLAKE; + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif break; case 7: if (model == 10) From cbfd3c87e17f9a3123e25802d07613842f325ca2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 20:44:06 +0200 Subject: [PATCH 021/108] Recognize Intel Ice Lake SP as Cooper Lake --- driver/others/dynamic.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 158e1b3da..46ad06a7c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -621,6 +621,22 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } } + if (model == 10) { + // Ice Lake SP + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } return NULL; case 7: if (model == 10) // Goldmont Plus From c4da892ba0798f8697e7b3219fd631651647e45f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 23:19:10 +0200 Subject: [PATCH 022/108] Only filter out -mavx on Sandybridge ZGEMM/ZTRMM kernels --- kernel/Makefile.L3 | 86 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 18 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index be10ee018..2d9e3ec36 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -818,8 +818,10 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s -else +else ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX 
-DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -828,8 +830,10 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s -else +else ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -838,8 +842,10 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s -else +else ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -848,8 +854,10 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s -else +else ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ endif $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) @@ -1044,8 +1052,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) 
$(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1054,8 +1064,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1064,8 +1076,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1074,8 +1088,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1084,8 +1100,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c 
-DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1094,8 +1112,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1104,8 +1124,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1114,8 +1136,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ 
-DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif else @@ -1187,28 +1211,54 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out 
-mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif endif From 32264ba496e49f774a4fe3d63ff06cac6d37ef62 Mon Sep 17 00:00:00 2001 From: Noan <66834344+dnoan@users.noreply.github.com> Date: Sun, 16 May 2021 09:49:13 +0000 Subject: [PATCH 023/108] Update Makefile.arm64 Added -march and -mtune flags for EMAG processors when GCC 9 or later --- Makefile.arm64 | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index 23362b4e5..3858d7e3f 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -107,4 +107,13 @@ FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif endif endif + +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq ($(CORE), EMAG8180) +CCOMMON_OPT += -march=armv8-a -mtune=emag +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=emag +endif +endif +endif endif From 26ccf643a38ef501981b3dc629a78f3ed4bdd39f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 May 2021 13:04:38 +0200 Subject: [PATCH 024/108] Add -lm for FreeBSD on ARM/ARM64 --- Makefile.system | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile.system b/Makefile.system index ae703e4d9..bffe684d7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -380,6 +380,12 @@ ifeq ($(OSNAME), AIX) 
EXTRALIB += -lm endif +ifeq ($(OSNAME), FreeBSD) +ifeq ($(ARCH), $(filter ($ARCH),arm arm64)) +EXTRALIB += -lm +endif +endif + ifeq ($(OSNAME), WINNT) NEED_PIC = 0 NO_EXPRECISION = 1 From 5c729c6dce38bda7c870325bc0fcd035ae65f1bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 May 2021 14:47:14 +0200 Subject: [PATCH 025/108] Correct function name in error message from SLASQ2 (Reference-LAPACK PR 555) --- lapack-netlib/SRC/slasq2.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/slasq2.f b/lapack-netlib/SRC/slasq2.f index 219797c4a..c0c71b82e 100644 --- a/lapack-netlib/SRC/slasq2.f +++ b/lapack-netlib/SRC/slasq2.f @@ -185,7 +185,7 @@ * IF( Z( 1 ).LT.ZERO ) THEN INFO = -201 - CALL XERBLA( 'DLASQ2', 2 ) + CALL XERBLA( 'SLASQ2', 2 ) RETURN ELSE IF( Z( 2 ).LT.ZERO ) THEN INFO = -202 From 0e73d206297f5e419647f4d579da8a93e9b730dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 May 2021 14:23:49 +0200 Subject: [PATCH 026/108] Handle inadvertent use of DYNAMIC_ARCH=0 --- Makefile.x86 | 2 +- Makefile.x86_64 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.x86 b/Makefile.x86 index 893379c33..25ca660bd 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,6 +1,6 @@ # COMPILER_PREFIX = mingw32- -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH),1) ADD_CPUFLAGS = 1 else ifdef TARGET_CORE diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f62ab9e5e..307cbe1d9 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,7 +9,7 @@ endif endif -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH),1) ADD_CPUFLAGS = 1 else ifdef TARGET_CORE From 3a53207cc9f5907c257359ef37dc3c0df3f62ac2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 May 2021 14:29:45 +0200 Subject: [PATCH 027/108] Fix spurious error exit test failures in the ?chktsqr tests (LAPACK564) --- lapack-netlib/TESTING/LIN/cchktsqr.f | 2 ++ lapack-netlib/TESTING/LIN/dchktsqr.f | 2 ++ lapack-netlib/TESTING/LIN/schktsqr.f | 2 ++ 
lapack-netlib/TESTING/LIN/zchktsqr.f | 2 ++ 4 files changed, 8 insertions(+) diff --git a/lapack-netlib/TESTING/LIN/cchktsqr.f b/lapack-netlib/TESTING/LIN/cchktsqr.f index 8288916db..62b6ce434 100644 --- a/lapack-netlib/TESTING/LIN/cchktsqr.f +++ b/lapack-netlib/TESTING/LIN/cchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL CERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/dchktsqr.f b/lapack-netlib/TESTING/LIN/dchktsqr.f index c4b1f01bd..14119e6e5 100644 --- a/lapack-netlib/TESTING/LIN/dchktsqr.f +++ b/lapack-netlib/TESTING/LIN/dchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL DERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/schktsqr.f b/lapack-netlib/TESTING/LIN/schktsqr.f index 2bed434a8..aa4d6f9c4 100644 --- a/lapack-netlib/TESTING/LIN/schktsqr.f +++ b/lapack-netlib/TESTING/LIN/schktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL SERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/zchktsqr.f b/lapack-netlib/TESTING/LIN/zchktsqr.f index e6e6ac556..678b1772f 100644 --- a/lapack-netlib/TESTING/LIN/zchktsqr.f +++ b/lapack-netlib/TESTING/LIN/zchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL ZERRTSQR( PATH, NOUT ) INFOT = 0 * From 03297ff9f08d8fe42e4ef93f6f54bd82c6a9f6fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 May 2021 20:41:18 +0200 Subject: [PATCH 028/108] Add fast path for small xSYR with INCX==1 --- interface/syr.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/interface/syr.c b/interface/syr.c index 1374bcc69..ad75264b1 100644 --- a/interface/syr.c +++ b/interface/syr.c @@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, 
IDEBUG_START; FUNCTION_PROFILE_START(); +#if 1 + if (incx == 1 && n < 100) { + BLASLONG i; + if (uplo == 0) { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); + } + a += lda; + } + } else { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); + } + a += 1 + lda; + } + } + return; + } +#endif if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); From 4fbc0777f434aa4e7b8e91279ca0e6bc54ea63a8 Mon Sep 17 00:00:00 2001 From: MikaelUrankar Date: Wed, 26 May 2021 12:14:57 +0200 Subject: [PATCH 029/108] Fix typo --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index bffe684d7..2264b143b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -381,7 +381,7 @@ EXTRALIB += -lm endif ifeq ($(OSNAME), FreeBSD) -ifeq ($(ARCH), $(filter ($ARCH),arm arm64)) +ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) EXTRALIB += -lm endif endif From f0e7345fb8513afea09a7b848508f4800a225a9a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 May 2021 22:02:34 +0200 Subject: [PATCH 030/108] Add shortcut for small-size gemv_n with increments of one --- interface/gemv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index d5d739fb1..b6c2e6095 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -202,6 +202,11 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; + if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { + GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); From d6d7a6685dc5189b43519bb0d5a5fba52b4b0955 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 May 2021 22:39:18 +0200 Subject: [PATCH 031/108] Add shortcuts for (small) cases that do not need expensive buffer allocation --- interface/ger.c | 5 +++++ 
interface/spr.c | 20 ++++++++++++++++++++ interface/spr2.c | 18 ++++++++++++++++++ interface/symv.c | 4 ++++ interface/syr2.c | 19 +++++++++++++++++++ interface/zsyr.c | 26 ++++++++++++++++++++++++++ 6 files changed, 92 insertions(+) diff --git a/interface/ger.c b/interface/ger.c index 8cf1614e3..1c72d51ec 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order, if (m == 0 || n == 0) return; if (alpha == 0.) return; + if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) { + GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/spr.c b/interface/spr.c index 1956986e9..8aafc9f85 100644 --- a/interface/spr.c +++ b/interface/spr.c @@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order, FUNCTION_PROFILE_START(); + if (incx == 1 && n <100) { + blasint i; + if (uplo==0) { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); + } + a += i + 1; + } + } else { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); + } + a += n - i; + } + } + return; + } + if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); diff --git a/interface/spr2.c b/interface/spr2.c index 73a811c3e..b5aab1767 100644 --- a/interface/spr2.c +++ b/interface/spr2.c @@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; + if (incx == 1 && incy == 1 && n < 50) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); + a += i + 1; + } + } else { + for (i = 0; i < n; i++){ + AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); + AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); + a += n - i; + } + } + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git 
a/interface/symv.c b/interface/symv.c index 07bd20022..de2b91ee4 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -170,6 +170,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (alpha == ZERO) return; + if (incx == 1 && incy == 1 && n*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { + (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); + return; + } IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/syr2.c b/interface/syr2.c index 08fd47e57..632906d28 100644 --- a/interface/syr2.c +++ b/interface/syr2.c @@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, IDEBUG_START; + if (incx == 1 && incy == 1 && n < 100) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); + a += lda; + } + } else { + for (i = 0; i < n; i++){ + AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); + AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); + a += 1 + lda; + } + } + return; + } + + FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; diff --git a/interface/zsyr.c b/interface/zsyr.c index 09b1de578..b68237c93 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -172,6 +172,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + if (incx == 1 && incy == 1 && n < 50) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], + alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], + x, 1, a, 1, NULL, 0); + } + a += lda; + } + } else { + for (i = 0; i < n; i++){ + if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { + AXPYU_K(m - i, 0, 0, + alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], + alpha_i * x[i * 2 + 0] + alpha_r * 
x[i * 2 + 1], + x + i * 2, 1, a, 1, NULL, 0); + } + a += 2 + lda; + } + } + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); From 1217eb910d2da2e8ce47ef62fd3543c6345a3923 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 May 2021 09:38:48 +0200 Subject: [PATCH 032/108] Fix copy-paste errors in variables used --- interface/zsyr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index b68237c93..71d4dbf29 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -172,7 +172,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; - if (incx == 1 && incy == 1 && n < 50) { + if (incx == 1 && n < 50) { blasint i; if (!uplo) { for (i = 0; i < n; i++){ @@ -187,7 +187,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO } else { for (i = 0; i < n; i++){ if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { - AXPYU_K(m - i, 0, 0, + AXPYU_K(n - i, 0, 0, alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], x + i * 2, 1, a, 1, NULL, 0); From 734bd265a8b1c80f8fc078ad93fad817bdc9c08e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 May 2021 15:40:03 +0200 Subject: [PATCH 033/108] revert symv changes for now --- interface/symv.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/interface/symv.c b/interface/symv.c index de2b91ee4..07bd20022 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -170,10 +170,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (alpha == ZERO) return; - if (incx == 1 && incy == 1 && n*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { - (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); - return; - } IDEBUG_START; FUNCTION_PROFILE_START(); From f84197c1a731889495f282be1d7089deedc83081 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 May 2021 22:28:00 +0200 
Subject: [PATCH 034/108] Add shortcuts for (small) cases that do not need expensive buffer allocation --- interface/trsv.c | 6 ++++++ interface/ztrsv.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/interface/trsv.c b/interface/trsv.c index a054d8eeb..6a6e8f8ba 100644 --- a/interface/trsv.c +++ b/interface/trsv.c @@ -188,6 +188,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (n == 0) return; + if (incx == 1 && trans == 0 && n < 50) { + buffer = NULL; + (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ztrsv.c b/interface/ztrsv.c index cbb7bba13..cf750b0b0 100644 --- a/interface/ztrsv.c +++ b/interface/ztrsv.c @@ -199,6 +199,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (n == 0) return; + if (incx == 1 && trans == 0 && n < 50) { + buffer = NULL; + (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); From 8c25b440a05d549f40b6a8af68288cf8aa7869f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Jun 2021 19:17:36 +0200 Subject: [PATCH 035/108] revert "try to work around gcc update problems" ...as homebrew has dropped at least gcc8 now --- .github/workflows/nightly-Homebrew-build.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index b025f8634..29ec96f73 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -43,11 +43,6 @@ jobs: - name: Update Homebrew if: github.event_name != 'pull_request' run: brew update || true - - - name: unlink installed gcc to allow updating - run: | - brew unlink gcc@8 - brew unlink gcc@9 - name: Install prerequisites run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas From 1e0192a5ccac28fc0c749f49d36ec7eda9757428 Mon Sep 17 00:00:00 2001 
From: Zhaofeng Li Date: Mon, 7 Jun 2021 22:49:39 +0000 Subject: [PATCH 036/108] riscv64/imin: Fix wrong comparison Same as #1990. --- kernel/riscv64/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c index 598cba387..ffc65226e 100644 --- a/kernel/riscv64/imin.c +++ b/kernel/riscv64/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 3521cd48cbfb3d50f6ae9a10377382d37075c696 Mon Sep 17 00:00:00 2001 From: Zhaofeng Li Date: Mon, 7 Jun 2021 22:50:23 +0000 Subject: [PATCH 037/108] RISCV64_GENERIC: Use generic kernel for DSDOT for better precision The implementation in `riscv64/dot.c` fails the `test_dsdot` test, and the generic kernel seems to have better precision. Tested on SiFive FU740 (HiFive Unmatched) and QEMU. Also see #1469. --- kernel/riscv64/KERNEL.RISCV64_GENERIC | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index ea6a8cf21..61a8a2b91 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -54,6 +54,7 @@ SDOTKERNEL = ../riscv64/dot.c DDOTKERNEL = ../riscv64/dot.c CDOTKERNEL = ../riscv64/zdot.c ZDOTKERNEL = ../riscv64/zdot.c +DSDOTKERNEL = ../generic/dot.c SNRM2KERNEL = ../riscv64/nrm2.c DNRM2KERNEL = ../riscv64/nrm2.c From 590be3fae35d134fae156c60dc3a21d7933f8914 Mon Sep 17 00:00:00 2001 From: Zhaofeng Li Date: Mon, 7 Jun 2021 22:55:56 +0000 Subject: [PATCH 038/108] riscv64: Add Makefile --- kernel/riscv64/Makefile | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/riscv64/Makefile diff --git a/kernel/riscv64/Makefile b/kernel/riscv64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/riscv64/Makefile @@ -0,0 +1 @@ +clean :: From 706a08d4a01b28bc6445193dbf385260047cd0b6 Mon Sep 17 00:00:00 2001 From: "Ma, Yu" Date: 
Tue, 8 Jun 2021 15:08:28 -0400 Subject: [PATCH 039/108] Optimized sgemv_t for small N based on AVX512 --- kernel/x86_64/sgemv_t_4.c | 36 +- kernel/x86_64/sgemv_t_microk_skylakex.c | 60 + .../x86_64/sgemv_t_microk_skylakex_template.c | 1120 +++++++++++++++++ 3 files changed, 1215 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_t_microk_skylakex.c create mode 100644 kernel/x86_64/sgemv_t_microk_skylakex_template.c diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index fe886f57f..a36c8ace9 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,8 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_t_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#include "sgemv_t_microk_haswell-4.c" +#include "sgemv_t_microk_skylakex.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -305,6 +308,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( m < 1 ) return(0); if ( n < 1 ) return(0); + #ifdef HAVE_SGEMV_T_SKYLAKE_KERNEL + if (lda == m && n <= 16384 && m <= 8) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_T_SKYLAKE_KERNEL 1 +#include "common.h" +#include +#include "sgemv_t_microk_skylakex_template.c" + +//sgemv_t: +// ----- m ----- +// |<----------- +// |<----------- +// n +// |<----------- +// |<----------- + +static int sgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, float *a, float *x, float *y) +{ + switch(m) { + case 1: sgemv_kernel_t_1(n, alpha, a, x, y); break; + case 2: 
sgemv_kernel_t_2(n, alpha, a, x, y); break; + case 3: sgemv_kernel_t_3(n, alpha, a, x, y); break; + case 4: sgemv_kernel_t_4(n, alpha, a, x, y); break; + case 5: sgemv_kernel_t_5(n, alpha, a, x, y); break; + case 6: sgemv_kernel_t_6(n, alpha, a, x, y); break; + case 7: sgemv_kernel_t_7(n, alpha, a, x, y); break; + case 8: sgemv_kernel_t_8(n, alpha, a, x, y); break; + default: break; + } + return 0; +} + +#endif diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c new file mode 100644 index 000000000..34415054c --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c @@ -0,0 +1,1120 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+
+//Here the m means n in sgemv_t:
+// ----- n -----
+// |
+// |
+// m
+// |
+// |
+static int sgemv_kernel_t_1(BLASLONG m, float alpha, float *a, float *x, float *y)
+{
+    // Single-column case: y[i] += (alpha * x[0]) * a[i] for every i in [0, m).
+    // Full blocks of 128, 64, 32 and 16 floats are peeled in turn; the last m % 16 use a masked load/store.
+    __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
+    float alphaX = alpha * (*x);
+    __m512 ALPHAXVECTOR = _mm512_set1_ps(alphaX);
+
+    BLASLONG tag_m_128x = m & (~127);
+    BLASLONG tag_m_64x = m & (~63);
+    BLASLONG tag_m_32x = m & (~31);
+    BLASLONG tag_m_16x = m & (~15);
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) {
+        matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]);
+        matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]);
+        matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]);
+        matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]);
+        matrixArray_4 = _mm512_loadu_ps(&a[idx_m + 64]);
+        matrixArray_5 = _mm512_loadu_ps(&a[idx_m + 80]);
+        matrixArray_6 = _mm512_loadu_ps(&a[idx_m + 96]);
+        matrixArray_7 = _mm512_loadu_ps(&a[idx_m + 112]);
+
+        _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+        _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16])));
+        _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32])));
+        _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48])));
+        _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(matrixArray_4, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 64])));
+        _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(matrixArray_5, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 80])));
+        _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(matrixArray_6, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 96])));
+        _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(matrixArray_7, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 112])));
+
+    }
+
+    if (tag_m_128x != m) {
+        for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) {
+            matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]);
+            matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]);
+            matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]);
+            matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]);
+
+            _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+            _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16])));
+            _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32])));
+            _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48])));
+
+        }
+
+        if (tag_m_64x != m) {
+            for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) {
+                matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]);
+                matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]);
+
+                _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+                _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16])));
+
+            }
+
+            if (tag_m_32x != m) {
+                for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { // resume after the 32-wide loop, one 16-float vector per pass
+                    matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]);
+
+                    _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+                }
+
+                if (tag_m_16x != m) {
+                    unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); // low (m % 16) bits set
+                    __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+                    matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[tag_m_16x]);
+
+                    _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x])));
+
+                }
+
+
+            }
+        }
+    }
+
+    return 0;
+}
+// Two-column case: y[i] += alpha * (x[0]*a[2*i] + x[1]*a[2*i+1]); row i occupies a[2*i] and a[2*i+1].
+static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float *y)
+{
+    __m512 m0, m1, m2, m3, col0_1, col0_2, col1_1, col1_2, x1Array, x2Array;
+    float x1a = x[0] * alpha;
+    float x2a = x[1] * alpha;
+    x1Array = _mm512_set1_ps(x1a);
+    x2Array = _mm512_set1_ps(x2a);
+    BLASLONG tag_m_32x = m & (~31);
+    BLASLONG tag_m_16x = m & (~15);
+    BLASLONG tag_m_8x = m & (~7);
+    __m512i M512_EPI32_1 = _mm512_set1_epi32(1);
+    __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); // even slots of two registers: column 0
+    __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); // odd slots: column 1
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) {
+        m0 = _mm512_loadu_ps(&a[idx_m*2]);
+        m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]);
+        m2 = _mm512_loadu_ps(&a[idx_m*2 + 32]);
+        m3 = _mm512_loadu_ps(&a[idx_m*2 + 48]);
+        col0_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1);
+        col0_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1);
+        col1_1 = _mm512_permutex2var_ps(m2, idx_base_0, m3);
+        col1_2 = _mm512_permutex2var_ps(m2, idx_base_1, m3);
+
+        _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col0_2, _mm512_mul_ps(col0_1, x1Array)), _mm512_loadu_ps(&y[idx_m])));
+        _mm512_storeu_ps(&y[idx_m + 16], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m + 16])));
+    }
+    if (tag_m_32x != m) {
+        for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) {
+            m0 = _mm512_loadu_ps(&a[idx_m*2]); // row idx_m starts at a[idx_m*2], as in the 32-row loop
+            m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]);
+            col1_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1);
+            col1_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1);
+            _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m])));
+        }
+        if (tag_m_16x != m) {
+            __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+            unsigned char load_mask_value = (((unsigned char)0xff) >> 6); // 0x03: load only x[0] and x[1]
+            __mmask8 load_mask = *((__mmask8*) &load_mask_value);
+            x1Array = _mm512_broadcast_f32x2(_mm_maskz_loadu_ps(load_mask, x)); // x[0], x[1] repeated to match the interleaved rows
+            for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) {
+                m0 = _mm512_loadu_ps(&a[idx_m*2]); // 8 rows = 16 floats starting at a[idx_m*2]
+                m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR);
+                m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1);
+                __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0));
+                _mm256_storeu_ps(&y[idx_m], _mm256_add_ps(ret, _mm256_loadu_ps(&y[idx_m])));
+
+            }
+
+            if (tag_m_8x != m) {
+                unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-((m-tag_m_8x)*2)&15));
+                __mmask16 a_mask = *((__mmask16*) &tail_mask_value);
+                unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x)));
+                __mmask8 y_mask = *((__mmask8*) &y_mask_value);
+
+                m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x*2]); // remaining rows start at a[tag_m_8x*2]
+                m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR);
+                m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1);
+                __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0));
+                _mm256_mask_storeu_ps(&y[tag_m_8x], y_mask, _mm256_add_ps(ret, _mm256_maskz_loadu_ps(y_mask, &y[tag_m_8x])));
+            }
+        }
+    }
+    return 0;
+}
+
+static int sgemv_kernel_t_3(BLASLONG m, float alpha, float *a, float *x, float *y)
+{
+    __m512 m0, m1, m2, c1, c2, c3, tmp, x1Array, x2Array, x3Array;
+    float x1a = x[0] * alpha;
+    float x2a = x[1] * alpha;
+    float x3a = x[2] * alpha;
+    x1Array = _mm512_set1_ps(x1a);
+    x2Array =
_mm512_set1_ps(x2a); + x3Array = _mm512_set1_ps(x3a); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_s1 = _mm512_set1_epi32(-1); + __m512i idx_c1_1 = _mm512_set_epi32(0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i idx_c2_1 = _mm512_add_epi32(idx_c1_1, M512_EPI32_1); + __m512i idx_c3_1 = _mm512_add_epi32(idx_c2_1, M512_EPI32_1); + + __m512i idx_c3_2 = _mm512_set_epi32(31, 28, 25, 22, 19, 16, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i idx_c2_2 = _mm512_add_epi32(idx_c3_2, M512_EPI32_s1); + __m512i idx_c1_2 = _mm512_add_epi32(idx_c2_2, M512_EPI32_s1); + + __mmask16 step_1 = 0x07ff; + __mmask16 step_2 = 0xf800; + __mmask16 c31 = 0x03ff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*3]); + m1 = _mm512_loadu_ps(&a[idx_m*3 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*3 + 32]); + + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c1_1, m1); + c1 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c1_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c2_1, m1); + c2 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c2_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, c31, idx_c3_1, m1); + c3 = _mm512_permutex2var_ps(tmp, idx_c3_2, m2); + + tmp = _mm512_fmadd_ps(x2Array, c2, _mm512_mul_ps(c1, x1Array)); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x3Array, c3, tmp), _mm512_loadu_ps(&y[idx_m]))); + } + + if(tag_m_16x != m) { + __mmask8 a_mask = 0xff; + __m256i M256_EPI32_1 = _mm256_maskz_set1_epi32(a_mask, 1); + __m256i M256_EPI32_s1 = _mm256_maskz_set1_epi32(a_mask, -1); + __m256i idx_c1_1 = _mm256_set_epi32(0, 0, 15, 12, 9, 6, 3, 0); + __m256i idx_c2_1 = _mm256_add_epi32(idx_c1_1, M256_EPI32_1); + __m256i idx_c3_1 = _mm256_add_epi32(idx_c2_1, M256_EPI32_1); + + __m256i idx_c3_2 = _mm256_set_epi32(15, 12, 9, 0, 0, 0, 0, 0); + __m256i idx_c2_2 = 
_mm256_add_epi32(idx_c3_2, M256_EPI32_s1); + __m256i idx_c1_2 = _mm256_add_epi32(idx_c2_2, M256_EPI32_s1); + + __mmask8 step_1 = 0x1f; + __mmask8 step_2 = 0xe0; + __mmask8 c12 = 0xc0; + + __m256 m256_0, m256_1, m256_2, tmp256, c256_1, c256_2, c256_3, x256_1, x256_2, x256_3; + x256_1 = _mm256_set1_ps(x1a); + x256_2 = _mm256_set1_ps(x2a); + x256_3 = _mm256_set1_ps(x3a); + + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m256_0 = _mm256_loadu_ps(&a[idx_m*3]); + m256_1 = _mm256_loadu_ps(&a[idx_m*3 + 8]); + m256_2 = _mm256_loadu_ps(&a[idx_m*3 + 16]); + + tmp256 = _mm256_permutex2var_ps(m256_0, idx_c1_1, m256_1); + c256_1 = _mm256_mask_permutex2var_ps(tmp256, c12, idx_c1_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c2_1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c2_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c3_1, m256_1); + c256_3 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c3_2, m256_2); + + tmp256 = _mm256_fmadd_ps(x256_2, c256_2, _mm256_mul_ps(c256_1, x256_1)); + _mm256_storeu_ps(&y[idx_m], _mm256_maskz_add_ps(a_mask, _mm256_fmadd_ps(x256_3, c256_3, tmp256), _mm256_loadu_ps(&y[idx_m]))); + } + + if(tag_m_8x != m){ + for (BLASLONG idx_m = tag_m_8x; idx_m < tag_m_4x; idx_m+=4){ + m0 = _mm512_maskz_loadu_ps(0x0fff, &a[tag_m_8x*3]); + m256_0 = _mm512_extractf32x8_ps(m0, 0); + m256_1 = _mm512_extractf32x8_ps(m0, 1); + __m256i idx1 = _mm256_set_epi32(10, 7, 4, 1, 9, 6, 3, 0); + __m256i M256_EPI32_2 = _mm256_maskz_set1_epi32(0x0f, 2); + __m256i idx2 = _mm256_add_epi32(idx1, M256_EPI32_2); + + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0x0f, idx2, m256_1); + + __m128 c128_1 = _mm256_extractf32x4_ps(c256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(c256_1, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(c256_2, 0); + + __m128 x128_1 = _mm_set1_ps(x1a); + __m128 x128_2 = _mm_set1_ps(x2a); + __m128 x128_3 = 
_mm_set1_ps(x3a); + + __m128 tmp128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_2, x128_2)); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, _mm_maskz_add_ps(0x0f, _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, tmp128), _mm_maskz_loadu_ps(0x0f, &y[idx_m]))); + } + + if(tag_m_4x != m) { + for (BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0x3f, &a[idx_m*3]); + __m128 a128_1 = _mm256_extractf32x4_ps(m256_0, 0); + __m128 a128_2 = _mm256_extractf32x4_ps(m256_0, 1); + __m128 x128 = _mm_maskz_loadu_ps(0x07, x); + + __m128i idx128_1= _mm_set_epi32(0, 2, 1, 0); + __m128i M128_EPI32_3 = _mm_maskz_set1_epi32(0x07, 3); + __m128i idx128_2 = _mm_add_epi32(idx128_1, M128_EPI32_3); + + __m128 c128_1 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_1, a128_2); + __m128 c128_2 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_2, a128_2); + + __m128 tmp128 = _mm_hadd_ps(_mm_maskz_mul_ps(0x07, c128_1, x128), _mm_maskz_mul_ps(0x07, c128_2, x128)); + float ret[4]; + _mm_mask_storeu_ps(ret, 0x0f, tmp128); + y[idx_m] += alpha *(ret[0] + ret[1]); + y[idx_m+1] += alpha * (ret[2] + ret[3]); + } + + if(tag_m_2x != m) { + y[tag_m_2x] += alpha*(a[tag_m_2x*3]*x[0] + a[tag_m_2x*3+1]*x[1] + a[tag_m_2x*3+2]*x[2]); + } + } + } + } + + return 0; +} + +static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1, m2; + __m256 m256_0, m256_1, c256_1, c256_2; + __m128 c1, c2, c3, c4, ret; + __m128 xarray = _mm_maskz_loadu_ps(0x0f, x); + __m512 x512 = _mm512_broadcast_f32x4(xarray); + __m512 alphavector = _mm512_set1_ps(alpha); + __m512 xa512 = _mm512_mul_ps(x512, alphavector); + __m256i idx1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0); + __m256i idx2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2); + + + for (BLASLONG idx_m = 0; idx_m < tag_m_4x; idx_m+=4) { + m0 = _mm512_loadu_ps(&a[idx_m*4]); + m1 = _mm512_mul_ps(m0, xa512); + m256_0 = 
_mm512_extractf32x8_ps(m1, 0); + m256_1 = _mm512_extractf32x8_ps(m1, 1); + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx2, m256_1); + + c1 = _mm256_extractf32x4_ps(c256_1, 0); + c2 = _mm256_extractf32x4_ps(c256_1, 1); + c3 = _mm256_extractf32x4_ps(c256_2, 0); + c4 = _mm256_extractf32x4_ps(c256_2, 1); + + /* accumulate into the same four y elements that are stored below (was always y[0..3]) */ ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, &y[idx_m])); + _mm_mask_storeu_ps(&y[idx_m], 0xff, ret); + } + + if(tag_m_4x != m) { + float result[4]; + for(BLASLONG idx_m=tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0xff, &a[idx_m*4]); + c1 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 0); + c2 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 1); + + c3 = _mm_maskz_mul_ps(0x0f, c1, xarray); + c4 = _mm_maskz_mul_ps(0x0f, c2, xarray); + + ret = _mm_hadd_ps(c3, c4); + _mm_mask_storeu_ps(result, 0x0f, ret); + y[idx_m] += alpha *(result[0] + result[1]); + y[idx_m+1] += alpha * (result[2] + result[3]); + } + + if(tag_m_2x != m ) { + c1 = _mm_maskz_loadu_ps(0x0f, &a[tag_m_2x * 4]); + c2 = _mm_maskz_mul_ps(0x0f, c1, xarray); + _mm_mask_storeu_ps(result, 0x0f, c2); + y[tag_m_2x] += alpha *(result[0] + result[1] + result[2] + result[3]); + } + } + + return 0; +} + +static int sgemv_kernel_t_5(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1, m2, m3, m4, tmp0, tmp1, tmp2, accum, c0, c1, c2, c3, c4; + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); +
__m512i M512_EPI32_0 = _mm512_setzero_epi32(); + + __m512i idx_c0 = _mm512_set_epi32(27, 22, 17, 28, 23, 18, 13, 8, 3, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = _mm512_mask_blend_epi32(0x0040, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x1000, idx_c4, M512_EPI32_16); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*5 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*5 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*5 + 64]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c0, m3); + c0 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c0 = _mm512_mask_permutex2var_ps(c0, 0xe000, idx_c0, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c1, m3); + c1 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c1 = _mm512_mask_permutex2var_ps(c1, 0xe000, idx_c1, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c2, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c2, m3); + c2 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c2 = _mm512_mask_permutex2var_ps(c2, 0xe000, idx_c2, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c3, m3); + c3 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c3 = _mm512_mask_permutex2var_ps(c3, 0xe000, idx_c3, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x0fc0, m2, idx_c4, m3); + c4 = _mm512_mask_blend_ps(0x0fc0, tmp0, tmp1); + c4 = _mm512_mask_permutex2var_ps(c4, 0xf000, idx_c4, m4); + + accum = _mm512_fmadd_ps(c1, 
x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + + } + if(tag_m_16x !=m) { + __m512i idx_c0c2 = _mm512_set_epi32(0, 0, 27, 22, 17, 12, 7, 2 , 0, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1c3 = _mm512_add_epi32(idx_c0c2, M512_EPI32_1); + idx_c4 = _mm512_add_epi32(idx_c1c3, M512_EPI32_1); + __m256i idx_c0m4 = _mm256_set_epi32(11, 6, 0, 0, 0, 0, 0, 0); + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_c1m4 = _mm256_add_epi32(idx_c0m4, M256_EPI32_1); + __m256i idx_c2m4 = _mm256_add_epi32(idx_c1m4, M256_EPI32_1); + __m256i idx_c3m4 = _mm256_add_epi32(idx_c2m4, M256_EPI32_1); + __m256i idx_c4m4 = _mm256_add_epi32(idx_c3m4, M256_EPI32_1); + //TODO: below can change to use extract to decrease the latency + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = _mm256_set1_ps(x[4]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256, m256_4; + + for(BLASLONG idx_m=tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m256_4 = _mm256_loadu_ps(&a[idx_m*5 + 32]); + tmp0 = _mm512_permutex2var_ps(m0, idx_c0c2, m1); + tmp1 = _mm512_permutex2var_ps(m0, idx_c1c3, m1); + tmp2 = _mm512_permutex2var_ps(m0, idx_c4, m1); + + __m256 c256_0 = _mm512_extractf32x8_ps(tmp0, 0); + __m256 c256_2 = _mm512_extractf32x8_ps(tmp0, 1); + __m256 c256_1 = _mm512_extractf32x8_ps(tmp1, 0); + __m256 c256_3 = _mm512_extractf32x8_ps(tmp1, 1); + __m256 c256_4 = _mm512_extractf32x8_ps(tmp2, 1); + + c256_0 = _mm256_mask_permutex2var_ps(c256_0, 0x80, idx_c0m4, m256_4); + c256_1 = _mm256_mask_permutex2var_ps(c256_1, 0x80, idx_c1m4, m256_4); + c256_2 = 
_mm256_mask_permutex2var_ps(c256_2, 0xc0, idx_c2m4, m256_4); + c256_3 = _mm256_mask_permutex2var_ps(c256_3, 0xc0, idx_c3m4, m256_4); + c256_4 = _mm256_mask_permutex2var_ps(c256_4, 0xc0, idx_c4m4, m256_4); + + accum_256 = _mm256_fmadd_ps(c256_1, x1_256, _mm256_mul_ps(c256_0, x0_256)); + accum_256 = _mm256_fmadd_ps(c256_2, x2_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_3, x3_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_4, x4_256, accum_256); + accum_256 = _mm256_fmadd_ps(accum_256, alpha256, _mm256_loadu_ps(&y[idx_m])); + _mm256_storeu_ps(&y[idx_m], accum_256); + } + if(tag_m_8x != m) { + __m256i idx_c02 = _mm256_set_epi32(17, 12, 7, 2, 15, 10, 5, 0); + __m256i idx_c13 = _mm256_add_epi32(idx_c02, M256_EPI32_1); + __m256i idx_4 = _mm256_add_epi32(idx_c13, M256_EPI32_1); + __m128 accum_128; + __m256 m256_0, m256_1, tmp256_0, tmp256_1; + for (BLASLONG idx_m = tag_m_8x; idx_m < tag_m_4x; idx_m+=4){ + m256_0 = _mm256_loadu_ps(&a[idx_m*5]); + m256_1 = _mm256_loadu_ps(&a[idx_m*5 + 8]); + __m128 m128_4 = _mm_maskz_loadu_ps(0x0f, &a[idx_m*5 + 16]); + + tmp256_0 = _mm256_permutex2var_ps(m256_0, idx_c02, m256_1); + tmp256_1 = _mm256_permutex2var_ps(m256_0, idx_c13, m256_1); + __m256 tmp256_2 = _mm256_maskz_permutex2var_ps(0xf0, m256_0, idx_4, m256_1); + + __m128 c128_0 = _mm256_extractf32x4_ps(tmp256_0, 0); + __m128 c128_1 = _mm256_extractf32x4_ps(tmp256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(tmp256_0, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(tmp256_1, 1); + __m128 c128_4 = _mm256_extractf32x4_ps(tmp256_2, 1); + + __m128i idx_c14 = _mm_set_epi32(4, 0, 0, 0); + __m128i M128_EPI32_1 = _mm_set1_epi32(1); + __m128i idx_c24 = _mm_add_epi32(idx_c14, M128_EPI32_1); + __m128i idx_c34 = _mm_add_epi32(idx_c24, M128_EPI32_1); + __m128i idx_c44 = _mm_add_epi32(idx_c34, M128_EPI32_1); + + c128_1 = _mm_mask_permutex2var_ps(c128_1, 0x08, idx_c14, m128_4); + c128_2 = _mm_mask_permutex2var_ps(c128_2, 0x08, idx_c24, m128_4); + c128_3 = _mm_mask_permutex2var_ps(c128_3, 
0x08, idx_c34, m128_4); + c128_4 = _mm_mask_permutex2var_ps(c128_4, 0x08, idx_c44, m128_4); + + __m128 x128_0 = _mm256_extractf32x4_ps(x0_256, 0); + __m128 x128_1 = _mm256_extractf32x4_ps(x1_256, 0); + __m128 x128_2 = _mm256_extractf32x4_ps(x2_256, 0); + __m128 x128_3 = _mm256_extractf32x4_ps(x3_256, 0); + __m128 x128_4 = _mm256_extractf32x4_ps(x4_256, 0); + + __m128 alpha_128 = _mm256_extractf32x4_ps(alpha256, 0); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_0, x128_0)); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_2, x128_2, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_4, x128_4, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, accum_128, alpha_128, _mm_maskz_loadu_ps(0x0f, &y[idx_m])); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, accum_128); + + } + + if(tag_m_4x !=m ){ + x0_256 = _mm256_maskz_loadu_ps(0x1f, x); + x0_256 = _mm256_mul_ps(x0_256, alpha256); + float ret8[8]; + + for(BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5]); + m256_1 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5 + 5]); + + m256_0 = _mm256_mul_ps(m256_0, x0_256); + m256_1 = _mm256_mul_ps(m256_1, x0_256); + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[idx_m] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + _mm256_mask_storeu_ps(ret8, 0x1f, m256_1); + y[idx_m+1] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + + if(tag_m_2x != m){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[tag_m_2x*5]); + m256_0 = _mm256_mul_ps(m256_0, x0_256); + + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[tag_m_2x] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + } + } + + } + return 0; +} + +static int sgemv_kernel_t_6(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512 m0, m1, m2, 
m3, m4, m5, c0, c1, c2, c3, c4, c5, tmp0, tmp1, tmp2, accum; + __m512i idx_c0 = _mm512_set_epi32(26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = _mm512_mask_blend_epi32(0x0020, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x0400, idx_c4, M512_EPI32_0); + __m512i idx_c5 = _mm512_add_epi32(idx_c4, M512_EPI32_1); + + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 x5_512 = _mm512_set1_ps(x[5]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*6]); + m1 = _mm512_loadu_ps(&a[idx_m*6 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*6 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*6 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*6 + 64]); + m5 = _mm512_loadu_ps(&a[idx_m*6 + 80]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c0, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c0, m5); + c0 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c0 = _mm512_mask_blend_ps(0xf800, c0, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c1, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c1, m5); + c1 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c1 = _mm512_mask_blend_ps(0xf800, c1, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c2, m1); + tmp1 = 
_mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c2, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c2, m5); + c2 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c2 = _mm512_mask_blend_ps(0xf800, c2, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c3, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c3, m5); + c3 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c3 = _mm512_mask_blend_ps(0xf800, c3, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c4, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c4, m5); + c4 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c4 = _mm512_mask_blend_ps(0xfc00, c4, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c5 , m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c5 , m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c5 , m5); + c5 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c5 = _mm512_mask_blend_ps(0xfc00, c5, tmp2); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(c5, x5_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + } + + if(tag_m_16x != m) { + __m512i idx_c0c3 = _mm512_set_epi32(29, 23, 17, 27, 21, 15, 9, 3, 26, 20, 30, 24, 18, 12, 6, 0); + __m512i idx_c1c4 = _mm512_add_epi32(idx_c0c3, M512_EPI32_1); + __m512i idx_c2c5 = _mm512_add_epi32(idx_c1c4, M512_EPI32_1); + idx_c2c5 = _mm512_mask_blend_epi32(0x0020, idx_c2c5, M512_EPI32_16); + __m256 c256_0, c256_1, c256_2, c256_3, c256_4, c256_5; + + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 
= _mm256_set1_ps(x[4]); + __m256 x5_256 = _mm256_set1_ps(x[5]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256; + + for(BLASLONG idx_m = tag_m_16x; idx_m Date: Wed, 9 Jun 2021 12:20:09 -0500 Subject: [PATCH 040/108] POWER10: Fixes for sbgemm kernel While testing bfloat16 sbgemm kernel, there are some failures for odd value inputs due to array access beyond the boundary. --- kernel/power/sbgemm_kernel_power10.c | 136 ++++++++++++++------------- 1 file changed, 71 insertions(+), 65 deletions(-) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index d15586703..74f3eac4c 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -49,17 +49,11 @@ typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); -vector char mask = - { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, - 0xf -}; - /* * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of * bfloat16 floating-point values as input. Hence this * merging is needed on A and B matrices. 
*/ -#define MERGE_ROW(x) vec_perm(x, x, mask) #define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) @@ -179,8 +173,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); @@ -231,8 +225,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); MMA (&acc0, rowB_h, rowA_h); @@ -271,8 +265,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE_ACC (&acc0, 0); SAVE_ACC1 (&acc1, 0); @@ -306,8 +300,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 2)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + 
MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); SAVE4x2_ACC1 (&acc1, 0); @@ -338,8 +332,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 3)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); SAVE4x2_ACC1 (&acc1, 0); @@ -387,16 +381,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); - MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); - MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); - MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); - MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t)rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t)rowB_mrg, MERGE_LOW (rowA1[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -436,12 +430,12 @@ CNAME (BLASLONG m, BLASLONG n, 
BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -475,9 +469,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); } SAVE_ACC (&acc0, 0); SAVE_ACC (&acc1, 4); @@ -505,8 +500,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vector short rowA = { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; - vec_t *rowB = (vec_t *) & (BO[l]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE_ACC (&acc0, 0); CO += 4; @@ -536,8 +532,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 2; vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, 
MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<1)], 0, BO[(l<<1) + 1], 0, BO[(l<<1) + 2], 0, + BO[(l<<1) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); CO += 2; @@ -566,8 +565,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 2]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<2) + 0], 0, BO[(l<<2) + 1], 0, BO[(l <<2) + 2], 0, + BO[(l<<2) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); AO += k; @@ -620,14 +622,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowA1 = (vec_t *) & (A1[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -669,10 +671,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, 
IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero )); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -708,8 +710,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -740,8 +742,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[l << 1]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 1)], 0, AO[(l << 1) + 1] , 0 , AO[(l<<1) + 2], + 0, AO[(l << 1) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } SAVE2x4_ACC (&acc0, 0); CO += 4; @@ -829,10 +833,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 4)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA 
(&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); @@ -871,8 +875,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 3)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); @@ -904,8 +908,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 2)], 0, AO[(l << 2) + 1] , 0 , + AO[(l << 2) + 2], 0, AO[(l << 2) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); From 7fb6e576c254864f1b8990655dcc28b524f23c2f Mon Sep 17 00:00:00 2001 From: Arthur Williams Date: Wed, 9 Jun 2021 20:50:11 -0500 Subject: [PATCH 041/108] Removed use of non portable '-p' arg to install Not all versions of install support '-p' flag and it isn't worth failing the build in the installed files' timestamps get updated. 
--- Makefile.install | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile.install b/Makefile.install index e8b64465f..28727de37 100644 --- a/Makefile.install +++ b/Makefile.install @@ -74,17 +74,17 @@ endif ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif @@ -92,7 +92,7 @@ endif ifneq ($(NO_SHARED),1) @echo Copying the 
shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) - @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) From 7dfc45e840ba8c10d5564a700f54deed0303e3b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:09:50 +0200 Subject: [PATCH 042/108] Remove casts for PPC/POWER and complete parameters for POWER3/4 --- param.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/param.h b/param.h index a35ce69bd..ddad2fb36 100644 --- a/param.h +++ b/param.h @@ -72,13 +72,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H -#define LONGCAST (BLASLONG) -#if defined(__BYTE_ORDER__) -#if __GNUC__ < 9 -#undef LONGCAST -#define LONGCAST -#endif -#endif #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 @@ -2096,7 +2089,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef PPCG4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2127,7 +2120,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 2688 #define GEMM_DEFAULT_OFFSET_B 3072 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2176,7 +2169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2212,7 +2205,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2247,7 +2240,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER3) || defined(POWER4) || defined(POWER5) #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2271,6 +2264,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_Q 216 #define DGEMM_DEFAULT_R 1012 +#define CGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_Q 104 +#define CGEMM_DEFAULT_R 1012 + #define ZGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_Q 104 #define ZGEMM_DEFAULT_R 1012 @@ -2288,6 +2285,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 144 #define ZGEMM_DEFAULT_P 144 #endif + +#define SGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 #endif #if defined(POWER5) @@ -2320,7 +2322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2353,7 +2355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #if defined(__32BIT__) #warning using BINARY32==POWER6 #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2406,7 +2408,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 16 @@ -2445,7 +2447,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 16 From 7a48247761be4caf9030bfc0d5863558a28787b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:11:56 +0200 Subject: [PATCH 043/108] fix c/zrot and sgemv for POWER5 --- kernel/power/KERNEL.POWER5 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5 index fbef79e59..bea7b17c8 100644 --- a/kernel/power/KERNEL.POWER5 +++ b/kernel/power/KERNEL.POWER5 @@ -54,3 +54,8 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c +SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c From dc4fcb48df01a21be9d96c70c8ff66258fefd728 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:14:03 +0200 Subject: [PATCH 044/108] Fix inverted conditional for caxpy/zaxpy --- kernel/power/KERNEL.PPC440 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 677af5f21..fd9a8c780 100644 --- a/kernel/power/KERNEL.PPC440 +++ 
b/kernel/power/KERNEL.PPC440 @@ -16,11 +16,11 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c -else CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c endif SDOTKERNEL = dot_ppc440.S From fb9e678235a2e7ee7ce3a48263726d03b9827187 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:15:48 +0200 Subject: [PATCH 045/108] Fix caxpy/zaxpy for big-endian --- kernel/power/KERNEL.PPCG4 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 54660b54d..1bdd3119e 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -15,8 +15,13 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c +endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S From 08e2e60762b2b594a81479b766276224c4ae6bed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:17:33 +0200 Subject: [PATCH 046/108] Add prefetch values for power3 --- kernel/power/gemv_n.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index abc61b62e..9c6f87639 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -159,6 +159,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 From 8adf0971d801a43d799a57b5721aedc7dec3e68d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:18:22 +0200 Subject: [PATCH 047/108] Add prefetch values for power3 --- kernel/power/gemv_t.S | 5 +++++ 1 file 
changed, 5 insertions(+) diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 25a4dd01b..accdad702 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -124,6 +124,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 48 #define PREFETCHSIZE_C 16 From 3906ef3b0fb19e7436f2b4cf6394b11f3466b1f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:19:40 +0200 Subject: [PATCH 048/108] Add prefetch values for power3 --- kernel/power/zgemv_t.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index d82fab16a..314cf5e6e 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -129,6 +129,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 From efdbdd8f8254988a851e7759277fb8d38d319c84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:20:29 +0200 Subject: [PATCH 049/108] Add prefetch values for power3 --- kernel/power/zgemv_n.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 708f1318d..48f49f97b 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -155,6 +155,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 From 9d292d37b2cea829763f8f6bf8e5f4053bbf2a00 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 14 Jun 2021 17:01:28 +0900 Subject: [PATCH 050/108] arm64: add the missing d9 register to the clobber list Refs. 
numpy/numpy#18422 Signed-off-by: Gilles Gouaillardet --- kernel/arm64/dznrm2_thunderx2t99.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index b021a2832..fba2fe8ce 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -321,7 +321,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", REGINF ); } From e6dd44d98976a34baa447886753dd6b7ec31c380 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 15 Jun 2021 13:07:47 -0500 Subject: [PATCH 051/108] Power10: Fix for SBGEMM While testing bfloat16 sbgemm kernel, there are some failures for odd value inputs due to updating result for additional bytes. --- kernel/power/sbgemm_kernel_power10.c | 34 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 74f3eac4c..134929ec1 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -98,6 +98,30 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); rowC = (v2sf_t *) &CO[7* ldc+J]; \ rowC[0] += result[6] * alpha; + #define SAVE4x2_ACC_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[0 * ldc] += res[0][0]; \ + CO[1 * ldc] += res[1][0]; \ + CO[2 * ldc] += res[2][0]; \ + CO[3 * ldc] += res[3][0]; \ + } + + #define SAVE4x2_ACC1_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[4 * ldc] += res[0][0]; \ + 
CO[5 * ldc] += res[1][0]; \ + CO[6 * ldc] += res[2][0]; \ + CO[7 * ldc] += res[3][0]; \ +} + #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ @@ -313,7 +337,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0, acc1; __builtin_mma_xxsetaccz (&acc0); __builtin_mma_xxsetaccz (&acc1); @@ -335,8 +359,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); - SAVE4x2_ACC1 (&acc1, 0); + SAVE4x2_ACC_SCALAR (&acc0); + SAVE4x2_ACC1_SCALAR (&acc1); CO += 1; AO += k; BO += (k << 3); @@ -547,7 +571,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0; BLASLONG l = 0; __builtin_mma_xxsetaccz (&acc0); @@ -571,7 +595,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, }; MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC_SCALAR (&acc0); AO += k; BO += (k << 2); CO += 1; From 92e024bbb30d4445ce48be982d2625cac3c1df49 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:33:23 +0200 Subject: [PATCH 052/108] Declare SCASUM as EXTERNAL --- lapack-netlib/TESTING/EIG/cbdt05.f | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cbdt05.f b/lapack-netlib/TESTING/EIG/cbdt05.f index 5a08ccce3..4ed157431 100644 --- a/lapack-netlib/TESTING/EIG/cbdt05.f +++ b/lapack-netlib/TESTING/EIG/cbdt05.f @@ -158,9 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX - REAL SASUM, SLAMCH, CLANGE - EXTERNAL LSAME, ISAMAX, SASUM, SLAMCH, CLANGE - REAL SCASUM + REAL SASUM, SCASUM, SLAMCH, CLANGE + EXTERNAL LSAME, ISAMAX, SASUM, SCASUM, SLAMCH, CLANGE * .. * .. 
External Subroutines .. EXTERNAL CGEMM From 52693481784bbafba40edb2671540cccdb4c387e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:35:12 +0200 Subject: [PATCH 053/108] Declare CSROT as EXTERNAL --- lapack-netlib/TESTING/EIG/cckcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/cckcsd.f b/lapack-netlib/TESTING/EIG/cckcsd.f index 9783f0361..9524cb30b 100644 --- a/lapack-netlib/TESTING/EIG/cckcsd.f +++ b/lapack-netlib/TESTING/EIG/cckcsd.f @@ -228,7 +228,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, CCSDTS, CLACSG, CLAROR, - $ CLASET + $ CLASET, CSROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN From 9e1b43ea9b12fba1768d2b095149523704af76bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:39:28 +0200 Subject: [PATCH 054/108] Declare DROT as EXTERNAL --- lapack-netlib/TESTING/EIG/dckcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/dckcsd.f b/lapack-netlib/TESTING/EIG/dckcsd.f index 50db6baa0..063a5ef5c 100644 --- a/lapack-netlib/TESTING/EIG/dckcsd.f +++ b/lapack-netlib/TESTING/EIG/dckcsd.f @@ -226,7 +226,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, DCSDTS, DLACSG, DLAROR, - $ DLASET + $ DLASET, DROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN From e2621ef93ae32b0fef33437c91ed774aa469277a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:40:15 +0200 Subject: [PATCH 055/108] Declare SROT as EXTERNAL --- lapack-netlib/TESTING/EIG/sckcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/sckcsd.f b/lapack-netlib/TESTING/EIG/sckcsd.f index 5a6e4a099..be91eed51 100644 --- a/lapack-netlib/TESTING/EIG/sckcsd.f +++ b/lapack-netlib/TESTING/EIG/sckcsd.f @@ -226,7 +226,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, SCSDTS, SLACSG, SLAROR, - $ SLASET + $ SLASET, SROT * .. * .. 
Intrinsic Functions .. INTRINSIC ABS, MIN From cd0e4aadb1ee504371ba6fd516dcd5a3d9b65e95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:41:18 +0200 Subject: [PATCH 056/108] Declare ZDROT as EXTERNAL --- lapack-netlib/TESTING/EIG/zckcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zckcsd.f b/lapack-netlib/TESTING/EIG/zckcsd.f index f77b111a4..92760337c 100644 --- a/lapack-netlib/TESTING/EIG/zckcsd.f +++ b/lapack-netlib/TESTING/EIG/zckcsd.f @@ -228,7 +228,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, ZCSDTS, ZLACSG, ZLAROR, - $ ZLASET + $ ZLASET, ZDROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN From 5958ffc9b6b01046f160c6afa085444cb3b0204a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:43:39 +0200 Subject: [PATCH 057/108] Declare DZASUM as EXTERNAL --- lapack-netlib/TESTING/EIG/zbdt05.f | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/zbdt05.f b/lapack-netlib/TESTING/EIG/zbdt05.f index bbf0208b7..f262351e4 100644 --- a/lapack-netlib/TESTING/EIG/zbdt05.f +++ b/lapack-netlib/TESTING/EIG/zbdt05.f @@ -158,9 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX - DOUBLE PRECISION DASUM, DLAMCH, ZLANGE - EXTERNAL LSAME, IDAMAX, DASUM, DLAMCH, ZLANGE - DOUBLE PRECISION DZASUM + DOUBLE PRECISION DASUM, DZASUM, DLAMCH, ZLANGE + EXTERNAL LSAME, IDAMAX, DASUM, DZASUM, DLAMCH, ZLANGE * .. * .. External Subroutines .. 
EXTERNAL ZGEMM From 13fa9f737d11b5d59d7b941dadd51d8f9be25c52 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 12:17:25 +0200 Subject: [PATCH 058/108] Modify defines for CR and RC to work around name collision on Windows --- cmake/utils.cmake | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 29b5a067b..794d73d06 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -254,6 +254,19 @@ function(GenerateNamedObjects sources_in) # now add the object and set the defines set(obj_defines ${defines_in}) + list(FIND obj_defines "RC" def_idx) + if (${def_idx} GREATER -1) + # list(REMOVE_AT ${obj_defines} ${def_idx}) + list (REMOVE_ITEM obj_defines "RC") + list(APPEND obj_defines "RC=RC") + endif () + list(FIND obj_defines "CR" def_idx) + if (${def_idx} GREATER -1) + # list(REMOVE_AT ${obj_defines} ${def_idx}) + list (REMOVE_ITEM obj_defines "CR") + list(APPEND obj_defines "CR=CR") + endif () + if (use_cblas) set(obj_name "cblas_${obj_name}") list(APPEND obj_defines "CBLAS") From e83df9397581dc5413bcf36e9e29d5fdb3f68f70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 12:32:34 +0200 Subject: [PATCH 059/108] Work around another recent macro name collision with winnt.h --- driver/level3/Makefile | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 78f32b961..e893d915e 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ 
-473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -728,7 
+728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC)RC $< -o $(@F) xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) 
$(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2144,7 +2144,7 @@ xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) 
cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) 
zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h 
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) From 307c4c078692f79ac4e064668aacfadc31496b41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 13:41:16 +0200 Subject: [PATCH 
060/108] Fix typo --- driver/level3/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index e893d915e..b8465d4ed 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC)RC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) From b7da75e4fdc05976600949d588912cc9a6b9d22e Mon Sep 17 00:00:00 2001 From: User User-User Date: Sat, 19 Jun 2021 21:37:51 +0200 Subject: [PATCH 061/108] WiP CORTEX A55 support --- KERNEL.CORTEXA55 | 196 +++++++++++++++++++++++++++++++++++++++++++++++ Makefile.arm64 | 22 ++++++ Makefile.system | 1 + TargetList.txt | 1 + cpuid_arm64.c | 11 ++- getarch.c | 15 ++++ param.h | 2 +- 7 files changed, 245 insertions(+), 3 deletions(-) create mode 100644 KERNEL.CORTEXA55 diff --git a/KERNEL.CORTEXA55 b/KERNEL.CORTEXA55 new file mode 100644 index 000000000..db322dd0d --- /dev/null +++ b/KERNEL.CORTEXA55 @@ -0,0 +1,196 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 
8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq 
($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/Makefile.arm64 b/Makefile.arm64 index 3858d7e3f..c23a0876e 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -57,6 +57,28 @@ endif endif endif +# Use a53 tunings because a55 is only available in GCC>=8.1 +ifeq ($(CORE), CORTEXA55) +ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq ($(GCCVERSIONGTEQ8), 1) +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +endif +endif +endif + ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -march=armv8-a -mtune=thunderx ifneq ($(F_COMPILER), NAG) diff --git a/Makefile.system b/Makefile.system 
index 2264b143b..0cd3e3a7c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -625,6 +625,7 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 diff --git a/TargetList.txt b/TargetList.txt index d19964916..f93a629d8 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -92,6 +92,7 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +CORTEXA55 EMAG8180 FALKOR THUNDERX diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5f5d7771b..a150301d1 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -36,6 +36,7 @@ size_t length=sizeof(value); #define CPU_ARMV8 1 // Arm #define CPU_CORTEXA53 2 +#define CPU_CORTEXA55 14 #define CPU_CORTEXA57 3 #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 @@ -67,7 +68,8 @@ static char *cpuname[] = { "EMAG8180", "NEOVERSEN1", "THUNDERX3T110", - "VORTEX" + "VORTEX", + "CORTEXA55" }; static char *cpuname_lower[] = { @@ -84,7 +86,8 @@ static char *cpuname_lower[] = { "emag8180", "neoversen1", "thunderx3t110", - "vortex" + "vortex", + "cortexa55" }; int get_feature(char *search) @@ -161,6 +164,8 @@ int detect(void) return CPU_CORTEXA73; else if (strstr(cpu_part, "0xd0c")) return CPU_NEOVERSEN1; + else if (strstr(cpu_part, "0xd05")) + return CPU_CORTEXA55; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) @@ -284,6 +289,8 @@ void get_cpuconfig(void) printf("#define %s\n", cpuname[d]); // Fall-through case CPU_ARMV8: + // case CPU_CORTEXA53; + // case CPU_CORTEXA55; // Minimum parameters for ARMv8 (based on A53) printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); diff --git a/getarch.c b/getarch.c index f48944f36..3bc8a0c3d 100644 --- a/getarch.c +++ b/getarch.c @@ -1159,6 +1159,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_CORTEXA55 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA55" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA55 " \ + "-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa55" +#define CORENAME "CORTEXA55" +#else +#endif #ifdef FORCE_FALKOR #define FORCE diff --git a/param.h b/param.h index ddad2fb36..01048023f 100644 --- a/param.h +++ b/param.h @@ -2959,7 +2959,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 -#elif defined(CORTEXA53) +#elif defined(CORTEXA53) || defined(CORTEXA55) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 From 39ef0880aea439d199c99031b59dd9bd9225b69d Mon Sep 17 00:00:00 2001 From: User User-User Date: Sat, 19 Jun 2021 21:49:58 +0200 Subject: [PATCH 062/108] copy conf --- kernel/arm64/KERNEL.CORTEXA55 | 196 ++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 kernel/arm64/KERNEL.CORTEXA55 diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 new file mode 100644 index 000000000..db322dd0d --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -0,0 +1,196 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + 
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c 
+ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) From 9335d427406b5f14f545a17408d1e1ae0d666790 Mon Sep 17 00:00:00 2001 From: User User-User Date: Sat, 19 Jun 2021 22:21:39 +0200 Subject: [PATCH 063/108] add gcc8 version matching --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 0cd3e3a7c..bb8c60e91 100644 --- a/Makefile.system +++ b/Makefile.system @@ -333,6 +333,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +GCCVERSIONGTEQ8 := $(shell expr `$(CC) 
-dumpversion | cut -f1 -d.` \>= 8) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) From 6423b282a1f95bdaa69d87b4a6302532a0ef1f83 Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 20 Jun 2021 14:19:41 +0200 Subject: [PATCH 064/108] dynamic_arch --- driver/others/dynamic_arm64.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 0b623c3ac..1bec91462 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -99,6 +99,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #else #define gotoblas_NEOVERSEN1 gotoblas_ARMV8 #endif +#ifdef DYN_CORTEX_A55 +extern gotoblas_t gotoblas_CORTEXA55; +#else +#define gotoblas_NEOVERSEN1 gotoblas_ARMV8 +#endif #else extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; @@ -111,11 +116,12 @@ extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_THUNDERX3T110; +extern gotoblas_t gotoblas_CORTEXA55; #endif extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 12 +#define NUM_CORETYPES 13 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -142,6 +148,7 @@ static char *corename[] = { "emag8180", "neoversen1", "thunderx3t110", + "cortexa55", "unknown" }; @@ -158,6 +165,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; + if (gotoblas == &gotoblas_CORTEXA55) return corename[12]; return corename[NUM_CORETYPES]; } @@ -189,6 +197,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return 
(&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_THUNDERX3T110); + case 12: return (&gotoblas_CORTEXA55); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -247,6 +256,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_CORTEXA73; case 0xd0c: // Neoverse N1 return &gotoblas_NEOVERSEN1; + case 0xd05: // Cortex A55 + return &gotoblas_CORTEXA55; } break; case 0x42: // Broadcom From 548aa522e5be12f21eabeb66154f315047c92dc2 Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 20 Jun 2021 15:29:25 +0200 Subject: [PATCH 065/108] remove misplaced file --- KERNEL.CORTEXA55 | 196 ----------------------------------------------- 1 file changed, 196 deletions(-) delete mode 100644 KERNEL.CORTEXA55 diff --git a/KERNEL.CORTEXA55 b/KERNEL.CORTEXA55 deleted file mode 100644 index db322dd0d..000000000 --- a/KERNEL.CORTEXA55 +++ /dev/null @@ -1,196 +0,0 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c 
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -else -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -endif -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = 
../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = 
cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) From 91e2b11d3cf423fd16e8081d34ea12e5fb032bdb Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 20 Jun 2021 15:32:42 +0200 Subject: [PATCH 066/108] add to cmake listings too --- cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 2 +- cmake/system.cmake | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 4451f9eaa..154e59db6 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index da7686c33..d86e10035 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -177,7 +177,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") + elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55") 
file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index d6c71b774..34874827c 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -39,7 +39,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () - if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") + if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") set(TARGET "ARMV7") endif () endif () From 750719528a624295e708fdd8ca31c42e5186120c Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 20 Jun 2021 16:40:43 +0200 Subject: [PATCH 067/108] bugz --- driver/others/dynamic_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 1bec91462..04ceaaf6d 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -102,7 +102,7 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else -#define gotoblas_NEOVERSEN1 gotoblas_ARMV8 +#define gotoblas_CORTEXA55 gotoblas_ARMV8 #endif #else extern gotoblas_t gotoblas_CORTEXA53; From 130327e9af42ee405afe69cd63eef7707bc454a8 Mon Sep 17 00:00:00 2001 From: User User-User Date: Tue, 22 Jun 2021 23:58:59 +0200 Subject: [PATCH 068/108] OK --- cpuid_arm64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a150301d1..041b04311 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -289,8 +289,6 @@ void get_cpuconfig(void) printf("#define %s\n", cpuname[d]); // Fall-through case CPU_ARMV8: - // case CPU_CORTEXA53; - // case CPU_CORTEXA55; // Minimum parameters for ARMv8 (based on A53) printf("#define L1_DATA_SIZE 32768\n"); 
printf("#define L1_DATA_LINESIZE 64\n"); From f0b822a7094e62fa187426029305acfc30772d8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Jun 2021 10:11:01 +0200 Subject: [PATCH 069/108] Update cpuid_arm64.c --- cpuid_arm64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 041b04311..2a9399f7d 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -286,6 +286,7 @@ void get_cpuconfig(void) { case CPU_CORTEXA53: + case CPU_CORTEXA55: printf("#define %s\n", cpuname[d]); // Fall-through case CPU_ARMV8: From 3be660c0000606743ec0e747228f73435d190e8b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jun 2021 23:44:56 +0200 Subject: [PATCH 070/108] Add interface declarations for ?potri --- common_interface.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common_interface.h b/common_interface.h index b9ebb2772..318827920 100644 --- a/common_interface.h +++ b/common_interface.h @@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *); + int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); From 1f8bda71b9b07afccb5ab255d4a6156da60420fc Mon Sep 
17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jun 2021 23:46:00 +0200 Subject: [PATCH 071/108] Add OPENBLAS_LOOPS support to potrf/potrs/potri benchmark --- benchmark/potrf.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 116d0cca5..8808203a5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -99,14 +99,15 @@ int main(int argc, char *argv[]){ char *p; char btest = 'F'; - blasint m, i, j, info, uplos=0; - double flops; + blasint m, i, j, l, info, uplos=0; + double flops = 0.; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + double time1, timeg; argc--;argv++; @@ -119,6 +120,8 @@ int main(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ @@ -129,19 +132,21 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - for(m = from; m <= to; m += step){ + for(m = from; m <= to; m += step){ + timeg=0.; + for (l = 0; l < loops; l++) { #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; - a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; } } @@ 
-192,8 +197,8 @@ int main(int argc, char *argv[]){ exit(1); } - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; + if ( btest == 'F') + timeg += getsec(); if ( btest == 'S' ) { @@ -214,9 +219,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; - + timeg += getsec(); } if ( btest == 'I' ) @@ -232,11 +235,17 @@ int main(int argc, char *argv[]){ fprintf(stderr, "Potri info = %d\n", info); exit(1); } - - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; + timeg += getsec(); } - + } // loops + + time1 = timeg/(double)loops; + if ( btest == 'F') + flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; + if ( btest == 'S') + flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; + if ( btest == 'I') + flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest); From 1b5620b66e3a834932fb527cdaef6ce22ce07ed0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jun 2021 23:47:41 +0200 Subject: [PATCH 072/108] Add lower threshold for multithreading in ?potrf and ?potri --- interface/lapack/potrf.c | 7 +++++++ interface/lapack/potri.c | 3 +++ interface/lapack/zpotrf.c | 7 +++++++ interface/lapack/zpotri.c | 9 +++++++++ 4 files changed, 26 insertions(+) diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c index dbd55f62f..3abc80133 100644 --- a/interface/lapack/potrf.c +++ 
b/interface/lapack/potrf.c @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; +#ifndef DOUBLE + if (args.n <128) +#else + if (args.n <64) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c index 2c0c64b6f..eb0fcbe70 100644 --- a/interface/lapack/potri.c +++ b/interface/lapack/potri.c @@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; + if (args.n < 180) + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c index c4cd99bf6..298efbbc1 100644 --- a/interface/lapack/zpotrf.c +++ b/interface/lapack/zpotrf.c @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; +#ifndef DOUBLE + if (args.n < 64) +#else + if (args.n < 64) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c index 8da211683..8748c6352 100644 --- a/interface/lapack/zpotri.c +++ b/interface/lapack/zpotri.c @@ -121,6 +121,15 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.n < 200) +#else + if (args.n < 150) +#endif + args.nthreads=1; + else +#endif + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif From 6ebcce229fdbc960795fd77488dcd84baffcc205 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 29 Jun 2021 17:17:34 +0200 Subject: [PATCH 073/108] Work around current conda/tqdm auto-update problem --- appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/appveyor.yml b/appveyor.yml index c9b2fa3a1..d575c5b7f 100644 --- a/appveyor.yml +++ b/appveyor.yml 
@@ -47,6 +47,7 @@ environment: install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force + - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" From 06e3b07ecb8d06b1a30b650b00891d58294bb865 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Jul 2021 17:38:45 +0200 Subject: [PATCH 074/108] Handle OPENBLAS_LOOPS and OPENBLAS_TEST options --- benchmark/getri.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/benchmark/getri.c b/benchmark/getri.c index 98a860906..4c8891226 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -72,13 +72,17 @@ int main(int argc, char *argv[]){ FLOAT *a,*work; FLOAT wkopt[4]; blasint *ipiv; - blasint m, i, j, info,lwork; + blasint m, i, j, l, info,lwork; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + double time1,timeg; + + char *p; + char btest = 'I'; argc--;argv++; @@ -86,6 +90,9 @@ int main(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} + if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); @@ -124,32 +131,41 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE FLops Time Lwork\n"); for(m = from; m <= to; m += step){ - + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); - GETRF (&m, &m, a, &m, ipiv, &info); + for (l = 0; l < loops; l++) { + if (btest == 'F') begin(); + GETRF (&m, &m, a, &m, ipiv, &info); + if 
(btest == 'F') { + end(); + timeg += getsec(); + } if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - begin(); + if (btest == 'I') begin(); lwork = -1; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); lwork = (blasint)wkopt[0]; GETRI(&m, a, &m, ipiv, work, &lwork, &info); - end(); + if (btest == 'I') end(); if (info) { fprintf(stderr, "failed compute inverse matrix .. %d\n", info); exit(1); } - time1 = getsec(); - + if (btest == 'I') + timeg += getsec(); + + } // loops + time1 = timeg/(double)loops; fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork); From dcfc5cf714923f6d9981c9fc2cdb5ce5b846c0ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Jul 2021 17:39:37 +0200 Subject: [PATCH 075/108] Handle OPENBLAS_LOOPS for more stable results --- benchmark/linpack.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 202035245..32ccb0386 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -72,17 +72,21 @@ int main(int argc, char *argv[]){ FLOAT *a, *b; blasint *ipiv; - blasint m, i, j, info; + blasint m, i, j, l, info; blasint unit = 1; int from = 1; int to = 200; int step = 1; + int loops = 1; FLOAT maxerr; - double time1, time2; + double time1, time2, timeg1,timeg2; + char *p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} @@ -110,9 +114,9 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); for(m = from; m <= to; m += step){ - + timeg1 = timeg2 = 0.; fprintf(stderr, " %6d : ", (int)m); - + for (l = 0; l < loops; l++) { for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; @@ -138,7 +142,7 @@ int 
main(int argc, char *argv[]){ exit(1); } - time1 = getsec(); + timeg1 += getsec(); begin(); @@ -151,8 +155,10 @@ int main(int argc, char *argv[]){ exit(1); } - time2 = getsec(); - + timeg2 += getsec(); + } //loops + time1=timeg1/(double)loops; + time2=timeg2/(double)loops; maxerr = 0.; for(i = 0; i < m; i++){ From 726c44242b6d565577e00a3c6591ffee5db005ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Jul 2021 17:41:05 +0200 Subject: [PATCH 076/108] Add lower threshold for multithreading --- interface/lapack/getrf.c | 9 ++++++++- interface/lapack/zgetrf.c | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c index 02bb124b3..323370ebc 100644 --- a/interface/lapack/getrf.c +++ b/interface/lapack/getrf.c @@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.m*args.n < 40000) +#else + if (args.m*args.n < 10000) +#endif + args.nthreads=1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c index 7f8db94f6..d03541fad 100644 --- a/interface/lapack/zgetrf.c +++ b/interface/lapack/zgetrf.c @@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); + if (args.m*args.n <10000) + args.nthreads = 1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif From 4620f988126d2e98b82fb28511fda29d27ef8bc4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Jul 2021 19:24:35 +0200 Subject: [PATCH 077/108] Mention availability of the Windows binaries in the Releases section --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 174f951f4..d7e0d60a7 100644 --- a/README.md +++ b/README.md @@ -27,7 
+27,7 @@ We provide official binary packages for the following platform: * Windows x86/x86_64 -You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). +You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases). ## Installation from Source From a4543e4918f9c732d4701315d5b22de31a79f737 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Jul 2021 16:59:43 +0200 Subject: [PATCH 078/108] Handle OPENBLAS_LOOP --- benchmark/syrk.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 82606a21a..fa0f24666 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -56,17 +56,20 @@ int main(int argc, char *argv[]){ char uplo='U'; char trans='N'; - + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; - blasint m, i, j; + blasint m, i, j, l; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + + double time1,timeg; argc--;argv++; @@ -95,9 +98,12 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); + for(l = 0; l < loops; l++) { + for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; @@ -111,8 +117,10 @@ int main(int argc, char *argv[]){ end(); - time1 = getsec(); - + timeg += getsec(); + + } //loops + time1 = timeg / (double)loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); From 8186963d8c454ba65325053eebe0a4328421755f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Jul 2021 17:00:26 +0200 Subject: [PATCH 079/108] Add lower limit for multithreading --- interface/syrk.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/interface/syrk.c b/interface/syrk.c index 7699db683..edb113d6c 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif args.common = NULL; +#ifndef COMPLEX +#ifdef DOUBLE + if (args.n < 100) +#else + if (args.n < 200) +#endif +#else + if (args.n < 65) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { From 3cfdb1770c0a405e3d976184a46dc4a394dc9030 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Jul 2021 20:21:07 +0200 Subject: [PATCH 080/108] Remove code that disabled EXTRALIB on RISCV C910V --- test/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/Makefile b/test/Makefile index 54fa60533..6c5f041c2 100644 --- a/test/Makefile +++ b/test/Makefile @@ -259,10 +259,6 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -ifeq ($(CORE), C910V) -EXTRALIB = -CEXTRALIB = -endif ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) From 0d8d261dd4936da6a11673ecaae54acb4e16ecad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Jul 2021 12:20:19 +0200 Subject: [PATCH 081/108] Recognize newer Zhaoxin/Centaur cpus as Nehalem --- cpuid_x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 18ff122e5..4553b89f1 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1631,7 +1631,9 @@ int get_cpuname(void){ case 0x6: return CPUTYPE_NANO; break; - + case 0x7: + return CPUTYPE_NEHALEM; + break; } return CPUTYPE_VIAC3; } @@ -2285,6 +2287,9 @@ int get_coretype(void){ case 0x6: return CORE_NANO; break; + case 0x7: + return CORE_NEHALEM; + 
break; } return CORE_VIAC3; } From eb2fdd3af0241759576988a4672dc76ab298538f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Jul 2021 12:23:15 +0200 Subject: [PATCH 082/108] Recognize newer Zhaoxin/Centaur processors as Nehalem --- driver/others/dynamic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 46ad06a7c..4212e868c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -824,6 +824,9 @@ static gotoblas_t *get_coretype(void){ switch (family) { case 0x6: return &gotoblas_NANO; + break; + case 0x7: + return &gotoblas_NEHALEM; } } From da623ae838ef8277a230004d88270b1fdb37235a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Jul 2021 18:26:23 +0200 Subject: [PATCH 083/108] Add vendor string Shanghai as the successor to Centaur --- cpuid_x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4553b89f1..4737b1851 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -283,6 +283,7 @@ int get_vendor(void){ if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor, " Shanghai ")) return VENDOR_CENTAUR; if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; From 8f22ac552befbc414ce56db5c5142a7f0a5038ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Jul 2021 18:28:49 +0200 Subject: [PATCH 084/108] Add vendor string Shanghai as successor to Centaur --- driver/others/dynamic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4212e868c..1a33870db 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -404,6 +404,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if 
(!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_CENTAUR; if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; From 2f6326a630f074489e8dcc0a53afce88fc800151 Mon Sep 17 00:00:00 2001 From: River Dillon Date: Sat, 10 Jul 2021 00:36:07 -0700 Subject: [PATCH 085/108] Remove --- driver/others/memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 63fa6a566..6e654ccf2 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1702,7 +1702,6 @@ inline int atoi(const char *str) { return 0; } #include #include #include -#include #include #include #include From 220f6a1c556a5cc94eb7bc230e64074bdc0a6d04 Mon Sep 17 00:00:00 2001 From: River Dillon Date: Sat, 10 Jul 2021 00:38:02 -0700 Subject: [PATCH 086/108] Add feature test macro for proper inclusion of --- openblas_config_template.h | 1 + 1 file changed, 1 insertion(+) diff --git a/openblas_config_template.h b/openblas_config_template.h index 858b8c5cb..1e17c9a16 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -99,5 +99,6 @@ typedef int blasint; /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. 
*/ #ifdef OPENBLAS_OS_LINUX +#define _GNU_SOURCE #include #endif From cecc2c65aad40f8f4a261ae503b92936c0b147f4 Mon Sep 17 00:00:00 2001 From: River Dillon Date: Sat, 10 Jul 2021 00:39:52 -0700 Subject: [PATCH 087/108] Add test of installed --- Makefile | 10 ++++++++-- test_install/Makefile | 15 +++++++++++++++ test_install/test_sched_include.c | 5 +++++ 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 test_install/Makefile create mode 100644 test_install/test_sched_include.c diff --git a/Makefile b/Makefile index 555d1c467..d31cc9c83 100644 --- a/Makefile +++ b/Makefile @@ -34,9 +34,9 @@ endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) -SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test +SUBDIRS_ALL = $(SUBDIRS) test ctest utest test_install exports benchmark ../laswp ../bench cpp_thread_test -.PHONY : all libs netlib $(RELA) test ctest shared install +.PHONY : all libs netlib $(RELA) test ctest test_install shared install .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test all :: libs netlib $(RELA) tests shared @@ -155,6 +155,11 @@ endif endif endif +test_install : + mkdir -p install + PREFIX=install $(MAKE) install + $(MAKE) -C test_install all + libs : ifeq ($(CORE), UNKNOWN) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) @@ -399,4 +404,5 @@ endif @$(MAKE) -C relapack clean @rm -f *.grd Makefile.conf_last config_last.h @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) + @rm -rf install @echo Done. 
diff --git a/test_install/Makefile b/test_install/Makefile new file mode 100644 index 000000000..29cd81322 --- /dev/null +++ b/test_install/Makefile @@ -0,0 +1,15 @@ +# +# tests of installed headers and libs +# + +INSTALLDIR = ../install + +.PHONY: all +all: test_sched_include + +test_sched_include: test_sched_include.c + $(CC) -c -I$(INSTALLDIR)/include $< + +.PHONY: clean +clean: + rm -f *.o diff --git a/test_install/test_sched_include.c b/test_install/test_sched_include.c new file mode 100644 index 000000000..aea35680d --- /dev/null +++ b/test_install/test_sched_include.c @@ -0,0 +1,5 @@ +// tests that inclusion of openblas_config.h works with musl + +#include + +cpu_set_t* cpu_set = NULL; From ddb6cee0d542464cef38c4b6532b4928df8807cc Mon Sep 17 00:00:00 2001 From: River Dillon Date: Sat, 10 Jul 2021 01:34:47 -0700 Subject: [PATCH 088/108] Contribution note --- CONTRIBUTORS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index be9a32a7c..6be41960c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -194,3 +194,6 @@ In chronological order: * PingTouGe Semiconductor Co., Ltd. * [2020-10] Add RISC-V Vector (0.7.1) support. 
Optimize BLAS kernels for Xuantie C910 + +* River Dillon + * [2021-07-10] fix compilation with musl libc From 4f4e286bf67aeb92132f06dd1637e437d3ec759d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 18:20:40 +0200 Subject: [PATCH 089/108] Fix copy-paste error in LIBCORE assignment for Tiger Lake --- cpuid_x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4737b1851..00fc8baa0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2164,13 +2164,13 @@ int get_coretype(void){ case 8: if (model == 12) { // Tiger Lake if(support_avx512()) - return CPUTYPE_SKYLAKEX; + return CORE_SKYLAKEX; if(support_avx2()) - return CPUTYPE_HASWELL; + return CORE_HASWELL; if(support_avx()) - return CPUTYPE_SANDYBRIDGE; + return CORE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; + return CORE_NEHALEM; } if (model == 14) { // Kaby Lake if(support_avx()) From d5110630986c89ee88560b2204b7c157533a979e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 18:52:44 +0200 Subject: [PATCH 090/108] Move Alpine Linux build job from Travis to Azure --- azure-pipelines.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4b6b2b0e6..8bc27eb08 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -147,3 +147,15 @@ jobs: export ANDROID_NDK_HOME=/usr/local/share/android-ndk make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 +- job: ALPINE_MUSL + pool: + vmImage: 'ubuntu-16.04' + steps: + - script | + wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 + alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + 
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + alpine make DYNAMIC_ARCH=1 BINARY=64 + alpine make DYNAMIC_ARCH=1 BINARY=64 install + From 89429fdaa2a859c5a1e44fc782a20a03b7fa6540 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 19:03:42 +0200 Subject: [PATCH 091/108] fix typo --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8bc27eb08..65bc8e680 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -149,9 +149,9 @@ jobs: - job: ALPINE_MUSL pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - - script | + - script: | wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } From d86290edf0edcc5f931c52dce3955348c40949f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 19:52:04 +0200 Subject: [PATCH 092/108] add sudo for install in Alpine --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 65bc8e680..6a7cc73e4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,5 +157,5 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine make DYNAMIC_ARCH=1 BINARY=64 install + alpine sudo make DYNAMIC_ARCH=1 BINARY=64 install From c9304199cfe6f7aa9d98b4d397e91edaf9a2929c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 20:12:33 +0200 Subject: [PATCH 093/108] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6a7cc73e4..cf43c0647 100644 --- 
a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -155,7 +155,7 @@ jobs: wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 alpine sudo make DYNAMIC_ARCH=1 BINARY=64 install From db57c449dc387d68b247ae0fe73bbb178a71118c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 20:57:21 +0200 Subject: [PATCH 094/108] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cf43c0647..47579aa2a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,5 +157,5 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine sudo make DYNAMIC_ARCH=1 BINARY=64 install + alpine echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install From 14e33e0f7e05e26b2b1cc2ced015c7722b0adc31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 21:27:53 +0200 Subject: [PATCH 095/108] Handle OPENBLAS_LOOPS in SYR2 benchmark --- benchmark/syr2.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmark/syr2.c b/benchmark/syr2.c index acbc86987..61d1036ea 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -46,14 +46,17 @@ int main(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - blasint m, i, j; + blasint m, i, j, l; blasint inc_x= 1; blasint inc_y= 1; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + + 
double time1,timeg; argc--;argv++; @@ -85,8 +88,9 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { - + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) { for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } @@ -107,8 +111,10 @@ int main(int argc, char *argv[]){ end(); - time1 = getsec(); + timeg += getsec(); + } // loops + time1 = timeg/(double)loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6); From 7e09570e04dc715f98bfcbc2c9374707b29f7d94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 22:41:49 +0200 Subject: [PATCH 096/108] Update azure-pipelines.yml --- azure-pipelines.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 47579aa2a..261b6877f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -158,4 +158,8 @@ jobs: sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 alpine echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install - + alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c + alpine echo "#include " >>test_install.c + alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c + alpine gcc -I/opt/OpenBLAS/include test_install.c -lopenblas -lpthread -lgfortran -o test_install + From 0266ba7cb67aa3e31dae140442bf38841207cfe4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 23:21:58 +0200 Subject: [PATCH 097/108] Update azure-pipelines.yml --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 261b6877f..2d7f597c1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -158,6 +158,7 @@ jobs: sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 alpine 
echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install + alpine ls -l /opt/OpenBLAS/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c From 69560ad3cec3bee4d1dbc7ceeeb9f345f3bfc46c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 07:25:07 +0200 Subject: [PATCH 098/108] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2d7f597c1..734c50d67 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,7 +157,7 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install + alpine (echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install) alpine ls -l /opt/OpenBLAS/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c From a27a61bb9adfc0b7adc36ea1945106feb0e03ccf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 08:24:20 +0200 Subject: [PATCH 099/108] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 734c50d67..368f4120e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,7 +157,7 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine (echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install) + alpine bash -c "echo ''|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install" alpine ls -l /opt/OpenBLAS/include alpine echo "// tests that inclusion of openblas_config.h works 
with musl" >test_install.c alpine echo "#include " >>test_install.c From c47e35acee00eb195175ec926aae7aebd7fa1dc9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 09:38:48 +0200 Subject: [PATCH 100/108] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 368f4120e..a9bb43da4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,7 +157,7 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine bash -c "echo ''|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install" + alpine sh -c "echo ''|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install" alpine ls -l /opt/OpenBLAS/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c From 8acb6fe3a86c093f993f97d8be14a98c80d10a2c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 11:29:52 +0200 Subject: [PATCH 101/108] Update azure-pipelines.yml --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a9bb43da4..6b4d6fad0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,10 +157,10 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine sh -c "echo ''|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install" - alpine ls -l /opt/OpenBLAS/include + alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install" + alpine ls -l mytestdir/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c - alpine gcc 
-I/opt/OpenBLAS/include test_install.c -lopenblas -lpthread -lgfortran -o test_install + alpine gcc -Imytestdir/include test_install.c -lopenblas -lpthread -lgfortran -o test_install From d2693eac04c568bb7201371603ec3c46f657d1c8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 11:54:02 +0200 Subject: [PATCH 102/108] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6b4d6fad0..fa37e46a1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,7 +157,7 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install" + alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install alpine ls -l mytestdir/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c From 836c7fb9f5dc52402dad37ef7db8ff47d3870bda Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 14:37:38 +0200 Subject: [PATCH 103/108] Revert addition of test_install target --- Makefile | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index d31cc9c83..555d1c467 100644 --- a/Makefile +++ b/Makefile @@ -34,9 +34,9 @@ endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) -SUBDIRS_ALL = $(SUBDIRS) test ctest utest test_install exports benchmark ../laswp ../bench cpp_thread_test +SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test -.PHONY : all libs netlib $(RELA) test ctest test_install shared install +.PHONY : all libs netlib $(RELA) test ctest shared install .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test all :: libs netlib $(RELA) tests shared @@ -155,11 +155,6 @@ endif 
endif endif -test_install : - mkdir -p install - PREFIX=install $(MAKE) install - $(MAKE) -C test_install all - libs : ifeq ($(CORE), UNKNOWN) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) @@ -404,5 +399,4 @@ endif @$(MAKE) -C relapack clean @rm -f *.grd Makefile.conf_last config_last.h @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) - @rm -rf install @echo Done. From eba2cd951e5851060dfbf1a2843b967b657b393f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 14:38:49 +0200 Subject: [PATCH 104/108] Revert addition of test_install --- test_install/Makefile | 15 --------------- test_install/test_sched_include.c | 5 ----- 2 files changed, 20 deletions(-) delete mode 100644 test_install/Makefile delete mode 100644 test_install/test_sched_include.c diff --git a/test_install/Makefile b/test_install/Makefile deleted file mode 100644 index 29cd81322..000000000 --- a/test_install/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -# -# tests of installed headers and libs -# - -INSTALLDIR = ../install - -.PHONY: all -all: test_sched_include - -test_sched_include: test_sched_include.c - $(CC) -c -I$(INSTALLDIR)/include $< - -.PHONY: clean -clean: - rm -f *.o diff --git a/test_install/test_sched_include.c b/test_install/test_sched_include.c deleted file mode 100644 index aea35680d..000000000 --- a/test_install/test_sched_include.c +++ /dev/null @@ -1,5 +0,0 @@ -// tests that inclusion of openblas_config.h works with musl - -#include - -cpu_set_t* cpu_set = NULL; From 7bb59fceb73431ab06b49f6c0e19a028ef2f82d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 16:00:29 +0200 Subject: [PATCH 105/108] Clean up some warnings --- interface/gemm.c | 2 ++ interface/gemv.c | 2 +- interface/ger.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index cd5d00589..10426fd8f 100644 --- 
a/interface/gemm.c +++ b/interface/gemm.c @@ -126,6 +126,7 @@ void NAME(char *TRANSA, char *TRANSB, #ifdef SMP double MNK; +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -144,6 +145,7 @@ void NAME(char *TRANSA, char *TRANSB, #endif #endif #endif +#endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; diff --git a/interface/gemv.c b/interface/gemv.c index b6c2e6095..1f14cdb2c 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -203,7 +203,7 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { - GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); + GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); return; } diff --git a/interface/ger.c b/interface/ger.c index 1c72d51ec..af6ae8606 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -165,7 +165,7 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == 0.) 
return; if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) { - GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); + GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL); return; } From b4cbfe66775063f55eea58c24446b8e8301fcf16 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 18:08:30 +0200 Subject: [PATCH 106/108] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fa37e46a1..0e806dc91 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -162,5 +162,5 @@ jobs: alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c - alpine gcc -Imytestdir/include test_install.c -lopenblas -lpthread -lgfortran -o test_install + alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install From 498479b13e257dcfbbc5600ad405639f378aaf70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 18:29:17 +0200 Subject: [PATCH 107/108] Update azure-pipelines.yml --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0e806dc91..889b920e3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -161,6 +161,7 @@ jobs: alpine ls -l mytestdir/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c - alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c + alpine echo "int main(){" >> test_install.c + alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install From 239ff330f822a3057ff657d11d084cd6e095aa4f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: 
Sun, 11 Jul 2021 23:48:39 +0200 Subject: [PATCH 108/108] Update Changelog for 0.3.16 --- Changelog.txt | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 6c5cf573e..8cd101699 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,52 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.16 + 11-Jul-2021 + +common: + - drastically reduced the stack size requirements for running the LAPACK + testsuite (Reference-LAPACK PR 553) + - fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK + PR 564) + - expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode + - improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N + and DGEMV_N, for small input sizes and consecutive arguments + - improved performance of xGETRF, xPOTRF and xPOTRI for small input sizes + by disabling multithreading + - fixed installing with BSD versions of the "install" utility + +RISCV: + - fixed the implementation of xIMIN + - improved the performance of DSDOT + - fixed linking of the tests on C910V with current vendor gcc + +POWER: +- fixed SBGEMM computation for some odd value inputs +- fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5 + +x86_64: + - improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus + - worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc + versions + - fixed compilation with MS Visual Studio versions older than 2017 + - fixed macro name collision with winnt.h from the latest Win10 SDK + - added cpu type autodetection for Intel Ice Lake SP + - fixed cpu type autodetection for Intel Tiger Lake + - added cpu type autodetection for recent Centaur/Zhaoxin models + - fixed compilation with musl libc + +ARM64: +- fixed compilation with gcc/gfortran on the Apple M1 +- fixed linking of the tests on FreeBSD +- fixed missing restore of a register in the
recently rewritten DNRM2 kernel + for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g. + DGEEV +- added compiler optimization flags for the EMAG8180 +- added initial support for Cortex A55 + +ARM: +- fixed linking of the tests on FreeBSD + ==================================================================== Version 0.3.15 2-May-2021