Merge pull request #81 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-09-05 12:47:03 +02:00 committed by GitHub
commit 0d1f30a297
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 127 additions and 87 deletions

View File

@@ -17,6 +17,7 @@ endif
ifeq ($(CORE), POWER9)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mvsx -fno-fast-math
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
CCOMMON_OPT += -mcpu=power8 -mtune=power8
@@ -24,10 +25,14 @@ else
CCOMMON_OPT += -mcpu=power9 -mtune=power9
endif
else
CCOMMON_OPT += -mcpu=power9 -mtune=power9
endif
else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
FCOMMON_OPT += -mcpu=power8 -mtune=power8
@@ -35,6 +40,9 @@ else
FCOMMON_OPT += -mcpu=power9 -mtune=power9
endif
else
FCOMMON_OPT += -mcpu=power9 -mtune=power9
endif
else
FCOMMON_OPT += -O2 -Mrecursive
endif
endif

View File

@@ -526,7 +526,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
BLASLONG width, i, j, k;
BLASLONG n, n_from, n_to;
int mode, mask;
double dnum;
double dnum, di, dinum;
if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
@@ -601,9 +601,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (nthreads - num_cpu > 1) {
double di = (double)i;
di = (double)i;
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) );
dinum = di * di + dnum;
if (dinum > 0)
width = (((BLASLONG)((sqrt(dinum) - di) + mask)/(mask+1)) * (mask+1) );
else
width = (((BLASLONG)(- di + mask)/(mask+1)) * (mask+1) );
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) );
@@ -643,10 +648,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (nthreads - num_cpu > 1) {
double di = (double)i;
di = (double)i;
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
dinum = di * di +dnum;
if (dinum > 0)
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
else
width = (((BLASLONG)(- di + mask)/(mask+1)) * (mask+1));
if ((width > n - i) || (width < mask)) width = n - i;
} else {

View File

@@ -2513,7 +2513,7 @@ void LAPACK_zgesvdq(
lapack_complex_double* U, lapack_int const* ldu,
lapack_complex_double* V, lapack_int const* ldv, lapack_int* numrank,
lapack_int* iwork, lapack_int const* liwork,
lapack_complex_float* cwork, lapack_int* lcwork,
lapack_complex_double* cwork, lapack_int* lcwork,
double* rwork, lapack_int const* lrwork,
lapack_int* info );
@@ -3650,45 +3650,58 @@ void LAPACK_zggrqf(
lapack_int* info );
#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq,
lapack_int* m, lapack_int* n, lapack_int* p,
lapack_int* k, lapack_int* l, float* a,
lapack_int* lda, float* b, lapack_int* ldb,
float* alpha, float* beta, float* u, lapack_int* ldu,
float* v, lapack_int* ldv, float* q, lapack_int* ldq,
float* work, lapack_int* iwork, lapack_int* info );
lapack_int LAPACK_sggsvd(
char const* jobu, char const* jobv, char const* jobq,
lapack_int const* m, lapack_int const* n, lapack_int const* p,
lapack_int* k, lapack_int* l,
float* a, lapack_int const* lda,
float* b, lapack_int const* ldb,
float* alpha, float* beta,
float* u, lapack_int const* ldu,
float* v, lapack_int const* ldv,
float* q, lapack_int const* ldq,
float* work, lapack_int* iwork, lapack_int* info );
#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq,
lapack_int* m, lapack_int* n, lapack_int* p,
lapack_int* k, lapack_int* l, double* a,
lapack_int* lda, double* b, lapack_int* ldb,
double* alpha, double* beta, double* u,
lapack_int* ldu, double* v, lapack_int* ldv, double* q,
lapack_int* ldq, float* work, lapack_int* iwork, lapack_int* info );
lapack_int LAPACK_dggsvd(
char const* jobu, char const* jobv, char const* jobq,
lapack_int const* m, lapack_int const* n, lapack_int const* p,
lapack_int* k, lapack_int* l,
double* a, lapack_int const* lda,
double* b, lapack_int const* ldb,
double* alpha, double* beta,
double* u, lapack_int const* ldu,
double* v, lapack_int const* ldv,
double* q, lapack_int const* ldq,
double* work, lapack_int* iwork, lapack_int* info );
#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq,
lapack_int* m, lapack_int* n, lapack_int* p,
lapack_int* k, lapack_int* l,
lapack_complex_float* a, lapack_int* lda,
lapack_complex_float* b, lapack_int* ldb,
float* alpha, float* beta, lapack_complex_float* u,
lapack_int* ldu, lapack_complex_float* v,
lapack_int* ldv, lapack_complex_float* q,
lapack_int* ldq, float* work, lapack_int* rwork, lapack_int* iwork, lapack_int *info );
lapack_int LAPACK_cggsvd(
char const* jobu, char const* jobv, char const* jobq,
lapack_int const* m, lapack_int const* n, lapack_int const* p,
lapack_int* k, lapack_int* l,
lapack_complex_float* a, lapack_int const* lda,
lapack_complex_float* b, lapack_int const* ldb,
float* alpha, float* beta,
lapack_complex_float* u, lapack_int const* ldu,
lapack_complex_float* v, lapack_int const* ldv,
lapack_complex_float* q, lapack_int const* ldq,
lapack_complex_float* work, float* rwork,
lapack_int* iwork, lapack_int* info );
#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq,
lapack_int* m, lapack_int* n, lapack_int* p,
lapack_int* k, lapack_int* l,
lapack_complex_double* a, lapack_int* lda,
lapack_complex_double* b, lapack_int* ldb,
double* alpha, double* beta,
lapack_complex_double* u, lapack_int* ldu,
lapack_complex_double* v, lapack_int* ldv,
lapack_complex_double* q, lapack_int* ldq,
float* work, lapack_int* rwork, lapack_int* iwork, lapack_int* info );
lapack_int LAPACK_zggsvd(
char const* jobu, char const* jobv, char const* jobq,
lapack_int const* m, lapack_int const* n, lapack_int const* p,
lapack_int* k, lapack_int* l,
lapack_complex_double* a, lapack_int const* lda,
lapack_complex_double* b, lapack_int const* ldb,
double* alpha, double* beta,
lapack_complex_double* u, lapack_int const* ldu,
lapack_complex_double* v, lapack_int const* ldv,
lapack_complex_double* q, lapack_int const* ldq,
lapack_complex_double* work, double* rwork,
lapack_int* iwork, lapack_int* info );
#define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3)
void LAPACK_cggsvd3(
@@ -3753,49 +3766,58 @@ void LAPACK_zggsvd3(
lapack_int* info );
#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP)
lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq,
lapack_int* m, lapack_int* p, lapack_int* n, float* a,
lapack_int* lda, float* b, lapack_int* ldb, float* tola,
float* tolb, lapack_int* k, lapack_int* l, float* u,
lapack_int* ldu, float* v, lapack_int* ldv, float* q,
lapack_int* ldq, lapack_int* iwork, float* tau,
float* work, lapack_int* info);
lapack_int LAPACK_sggsvp(
char const* jobu, char const* jobv, char const* jobq,
lapack_int const* m, lapack_int const* p, lapack_int const* n,
float* a, lapack_int const* lda,
float* b, lapack_int const* ldb,
float* tola, float* tolb,
lapack_int* k, lapack_int* l,
float* u, lapack_int const* ldu,
float* v, lapack_int const* ldv,
float* q, lapack_int const* ldq,
lapack_int* iwork, float* tau,
float* work, lapack_int* info );
#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP)
lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char const* jobq,
lapack_int* m, lapack_int* p, lapack_int* n, double* a,
lapack_int* lda, double* b, lapack_int* ldb,
double* tola, double* tolb, lapack_int* k,
lapack_int* l, double* u, lapack_int* ldu, double* v,
lapack_int* ldv, double* q, lapack_int* ldq,
lapack_int* iwork, double* tau, double* work,
lapack_int* info);
lapack_int LAPACK_dggsvp(
char const* jobu, char const* jobv, char const* jobq,
lapack_int const* m, lapack_int const* p, lapack_int const* n,
double* a, lapack_int const* lda,
double* b, lapack_int const* ldb,
double* tola, double* tolb,
lapack_int* k, lapack_int* l,
double* u, lapack_int const* ldu,
double* v, lapack_int const* ldv,
double* q, lapack_int const* ldq,
lapack_int* iwork, double* tau,
double* work, lapack_int* info );
#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP)
lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq,
lapack_int* m, lapack_int* p, lapack_int* n,
lapack_complex_float* a, lapack_int* lda,
lapack_complex_float* b, lapack_int* ldb, float* tola,
float* tolb, lapack_int* k, lapack_int* l,
lapack_complex_float* u, lapack_int* ldu,
lapack_complex_float* v, lapack_int* ldv,
lapack_complex_float* q, lapack_int* ldq,
lapack_int* iwork, lapack_int* rwork,
lapack_complex_float* tau, lapack_complex_float* work,
lapack_int* info);
lapack_int LAPACK_cggsvp(
char const* jobu, char const* jobv, char const* jobq,
lapack_int const* m, lapack_int const* p, lapack_int const* n,
lapack_complex_float* a, lapack_int const* lda,
lapack_complex_float* b, lapack_int const* ldb,
float* tola, float* tolb, lapack_int* k, lapack_int* l,
lapack_complex_float* u, lapack_int const* ldu,
lapack_complex_float* v, lapack_int const* ldv,
lapack_complex_float* q, lapack_int const* ldq,
lapack_int* iwork, float* rwork, lapack_complex_float* tau,
lapack_complex_float* work, lapack_int* info );
#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP)
lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq,
lapack_int* m, lapack_int* p, lapack_int* n,
lapack_complex_double* a, lapack_int* lda,
lapack_complex_double* b, lapack_int* ldb,
double* tola, double* tolb, lapack_int* k,
lapack_int* l, lapack_complex_double* u,
lapack_int* ldu, lapack_complex_double* v,
lapack_int* ldv, lapack_complex_double* q,
lapack_int* ldq, lapack_int* iwork, lapack_int* rwork,
lapack_complex_double* tau, lapack_complex_double* work,
lapack_int* info);
lapack_int LAPACK_zggsvp(
char const* jobu, char const* jobv, char const* jobq,
lapack_int const* m, lapack_int const* p, lapack_int const* n,
lapack_complex_double* a, lapack_int const* lda,
lapack_complex_double* b, lapack_int const* ldb,
double* tola, double* tolb, lapack_int* k, lapack_int* l,
lapack_complex_double* u, lapack_int const* ldu,
lapack_complex_double* v, lapack_int const* ldv,
lapack_complex_double* q, lapack_int const* ldq,
lapack_int* iwork, double* rwork, lapack_complex_double* tau,
lapack_complex_double* work, lapack_int* info );
#define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3)
void LAPACK_cggsvp3(

View File

@@ -47,8 +47,8 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp,
lapack_complex_float* cwork = NULL;
lapack_complex_float cwork_query;
lapack_int lrwork = -1;
double* rwork = NULL;
double rwork_query;
float* rwork = NULL;
float rwork_query;
lapack_int i;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 );
@@ -84,7 +84,7 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp,
info = LAPACK_WORK_MEMORY_ERROR;
goto exit_level_0;
}
rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork );
rwork = (float*)LAPACKE_malloc( sizeof(float) * lrwork );
if( rwork == NULL ) {
info = LAPACK_WORK_MEMORY_ERROR;
goto exit_level_0;

View File

@@ -680,8 +680,8 @@
* the one from above. Compare it with D1 computed
* using the DSBTRD.
*
CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 )
CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 )
CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU )
LH = MAX(1, 4*N)
LW = LWORK - LH
@@ -753,8 +753,8 @@
* the one from above. Compare it with D1 computed
* using the DSBTRD.
*
CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 )
CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 )
CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU )
LH = MAX(1, 4*N)
LW = LWORK - LH

View File

@@ -670,8 +670,8 @@
* the one from above. Compare it with D1 computed
* using the SSBTRD.
*
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N )
CALL SLACPY( ' ', K+1, N, A, LDA, U, LDU )
LH = MAX(1, 4*N)
LW = LWORK - LH

View File

@@ -999,8 +999,8 @@
* the one from above. Compare it with D1 computed
* using the 1-stage.
*
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N )
CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N )
CALL SLACPY( "U", N, N, A, LDA, V, LDU )
LH = MAX(1, 4*N)
LW = LWORK - LH