diff --git a/Makefile.power b/Makefile.power index 37a02d692..e766f8499 100644 --- a/Makefile.power +++ b/Makefile.power @@ -17,6 +17,7 @@ endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) CCOMMON_OPT += -mcpu=power8 -mtune=power8 @@ -24,10 +25,14 @@ else CCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else +CCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -fno-fast-math +ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) FCOMMON_OPT += -mcpu=power8 -mtune=power8 @@ -35,6 +40,9 @@ else FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else +FCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else FCOMMON_OPT += -O2 -Mrecursive endif endif diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index a041abac3..d7dcd68a3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -526,7 +526,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG width, i, j, k; BLASLONG n, n_from, n_to; int mode, mask; - double dnum; + double dnum, di, dinum; if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); @@ -601,9 +601,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (nthreads - num_cpu > 1) { - double di = (double)i; + di = (double)i; - width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) ); + dinum = di * di + dnum; + + if (dinum > 0) + width = (((BLASLONG)((sqrt(dinum) - di) + mask)/(mask+1)) * (mask+1) ); + else + width = (((BLASLONG)(- di + mask)/(mask+1)) * (mask+1) ); if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) ); @@ -643,10 +648,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (nthreads - num_cpu > 1) { - double di = (double)i; + di = (double)i; - width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + dinum = di * di +dnum; + if (dinum > 0) + width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + else + width = (((BLASLONG)(- di + mask)/(mask+1)) * (mask+1)); + if ((width > n - i) || (width < mask)) width = n - i; } else { diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 4f48b7c87..aedaa308d 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -2513,7 +2513,7 @@ void LAPACK_zgesvdq( lapack_complex_double* U, lapack_int const* ldu, lapack_complex_double* V, lapack_int const* ldv, lapack_int* numrank, lapack_int* iwork, lapack_int const* liwork, - lapack_complex_float* cwork, lapack_int* lcwork, + lapack_complex_double* cwork, lapack_int* lcwork, double* rwork, lapack_int const* lrwork, lapack_int* info ); @@ -3650,45 +3650,58 @@ void LAPACK_zggrqf( lapack_int* info ); #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD) -lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, - lapack_int* k, lapack_int* l, float* a, - lapack_int* lda, float* b, lapack_int* ldb, - float* alpha, float* beta, float* u, lapack_int* ldu, - float* v, lapack_int* ldv, float* q, lapack_int* ldq, - float* work, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_sggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + float* a, lapack_int const* lda, + float* b, lapack_int const* ldb, + float* alpha, float* beta, + float* u, lapack_int const* ldu, + float* v, lapack_int const* ldv, + float* q, lapack_int const* ldq, + float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD) -lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, - lapack_int* k, lapack_int* l, double* a, - lapack_int* lda, double* b, lapack_int* ldb, - double* alpha, double* beta, double* u, - lapack_int* ldu, double* v, lapack_int* ldv, double* q, - lapack_int* ldq, float* work, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_dggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + double* a, lapack_int const* lda, + double* b, lapack_int const* ldb, + double* alpha, double* beta, + double* u, lapack_int const* ldu, + double* v, lapack_int const* ldv, + double* q, lapack_int const* ldq, + double* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) -lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, - lapack_int* k, lapack_int* l, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - float* alpha, float* beta, lapack_complex_float* u, - lapack_int* ldu, lapack_complex_float* v, - lapack_int* ldv, lapack_complex_float* q, - lapack_int* ldq, float* work, lapack_int* rwork, lapack_int* iwork, lapack_int *info ); +lapack_int LAPACK_cggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, + float* alpha, float* beta, + lapack_complex_float* u, lapack_int const* ldu, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* q, lapack_int const* ldq, + lapack_complex_float* work, float* rwork, + lapack_int* iwork, lapack_int* info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) -lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, - lapack_int* k, lapack_int* l, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - double* alpha, double* beta, - lapack_complex_double* u, lapack_int* ldu, - lapack_complex_double* v, lapack_int* ldv, - lapack_complex_double* q, lapack_int* ldq, - float* work, lapack_int* rwork, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_zggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, + double* alpha, double* beta, + lapack_complex_double* u, lapack_int const* ldu, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* q, lapack_int const* ldq, + lapack_complex_double* work, double* rwork, + lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) void LAPACK_cggsvd3( @@ -3753,49 +3766,58 @@ void LAPACK_zggsvd3( lapack_int* info ); #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) -lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, float* a, - lapack_int* lda, float* b, lapack_int* ldb, float* tola, - float* tolb, lapack_int* k, lapack_int* l, float* u, - lapack_int* ldu, float* v, lapack_int* ldv, float* q, - lapack_int* ldq, lapack_int* iwork, float* tau, - float* work, lapack_int* info); +lapack_int LAPACK_sggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + float* a, lapack_int const* lda, + float* b, lapack_int const* ldb, + float* tola, float* tolb, + lapack_int* k, lapack_int* l, + float* u, lapack_int const* ldu, + float* v, lapack_int const* ldv, + float* q, lapack_int const* ldq, + lapack_int* iwork, float* tau, + float* work, lapack_int* info ); #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) -lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, double* a, - lapack_int* lda, double* b, lapack_int* ldb, - double* tola, double* tolb, lapack_int* k, - lapack_int* l, double* u, lapack_int* ldu, double* v, - lapack_int* ldv, double* q, lapack_int* ldq, - lapack_int* iwork, double* tau, double* work, - lapack_int* info); +lapack_int LAPACK_dggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + double* a, lapack_int const* lda, + double* b, lapack_int const* ldb, + double* tola, double* tolb, + lapack_int* k, lapack_int* l, + double* u, lapack_int const* ldu, + double* v, lapack_int const* ldv, + double* q, lapack_int const* ldq, + lapack_int* iwork, double* tau, + double* work, lapack_int* info ); #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) -lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, float* tola, - float* tolb, lapack_int* k, lapack_int* l, - lapack_complex_float* u, lapack_int* ldu, - lapack_complex_float* v, lapack_int* ldv, - lapack_complex_float* q, lapack_int* ldq, - lapack_int* iwork, lapack_int* rwork, - lapack_complex_float* tau, lapack_complex_float* work, - lapack_int* info); +lapack_int LAPACK_cggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, + float* tola, float* tolb, lapack_int* k, lapack_int* l, + lapack_complex_float* u, lapack_int const* ldu, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* q, lapack_int const* ldq, + lapack_int* iwork, float* rwork, lapack_complex_float* tau, + lapack_complex_float* work, lapack_int* info ); #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) -lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - double* tola, double* tolb, lapack_int* k, - lapack_int* l, lapack_complex_double* u, - lapack_int* ldu, lapack_complex_double* v, - lapack_int* ldv, lapack_complex_double* q, - lapack_int* ldq, lapack_int* iwork, lapack_int* rwork, - lapack_complex_double* tau, lapack_complex_double* work, - lapack_int* info); +lapack_int LAPACK_zggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, + double* tola, double* tolb, lapack_int* k, lapack_int* l, + lapack_complex_double* u, lapack_int const* ldu, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* q, lapack_int const* ldq, + lapack_int* iwork, double* rwork, lapack_complex_double* tau, + lapack_complex_double* work, lapack_int* info ); #define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3) void LAPACK_cggsvp3( diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 91458136c..c5eca535e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -47,8 +47,8 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_complex_float* cwork = NULL; lapack_complex_float cwork_query; lapack_int lrwork = -1; - double* rwork = NULL; - double rwork_query; + float* rwork = NULL; + float rwork_query; lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); @@ -84,7 +84,7 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } - rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork ); + rwork = (float*)LAPACKE_malloc( sizeof(float) * lrwork ); if( rwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/TESTING/EIG/cchkhb2stg.f b/lapack-netlib/TESTING/EIG/cchkhb2stg.f index 61537f44b..cd884febf 100644 --- a/lapack-netlib/TESTING/EIG/cchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkhb2stg.f @@ -680,8 +680,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -753,8 +753,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH diff --git a/lapack-netlib/TESTING/EIG/schksb2stg.f b/lapack-netlib/TESTING/EIG/schksb2stg.f index 07b6fa95c..7308bb690 100644 --- a/lapack-netlib/TESTING/EIG/schksb2stg.f +++ b/lapack-netlib/TESTING/EIG/schksb2stg.f @@ -670,8 +670,8 @@ * the one from above. Compare it with D1 computed * using the SSBTRD. * - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL SLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f index f386ab43c..83edb9dce 100644 --- a/lapack-netlib/TESTING/EIG/schkst2stg.f +++ b/lapack-netlib/TESTING/EIG/schkst2stg.f @@ -999,8 +999,8 @@ * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL SLACPY( "U", N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH