From 4f371b0fbf8219270e37cb7827850ac13c8686d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Mar 2020 23:45:58 +0100 Subject: [PATCH 01/20] Use POWER8 kernels on big-endian POWER9 for now --- kernel/power/KERNEL.POWER9 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 4bfa017e1..aabb5d976 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -1,3 +1,7 @@ +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +include $(KERNELDIR)/KERNEL.POWER8 +else + #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -206,3 +210,5 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +endif From f14013da7fda056a2ee42ccf88f14b46b91686ef Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Mar 2020 00:01:22 +0100 Subject: [PATCH 02/20] Update with 0.3.9 changes --- Changelog.txt | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index d66b2719a..5f924629b 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,48 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.9 + 1-Mar-2020 + + common: + * Fixed a miscompilation of the GETRF functions with CMAKE + * Imported bugfix 390 from LAPACK (missing NaN propagation in xCOMBSSQ) + * The size of the memory buffer used for splitting GEMM tasks across + multiple threads can now be configured in the build system. + +POWER: + * Fixed several compilation problems related to endianness + and ELF version on POWER8 and POWER9 + * Fixed use of the absolute value IAMIN/IAMAX instead of IMIN/IMAX + * Fixed a race condition in the level3 blas code + +MIPS64: + * Fixed use of the absoltute value IAMIN/IAMAX instead of IMIN/IMAX + +ARMV7: + * Fixed a race condition in the level3 blas code + * Fixed compilation on Android +ARMV8: + * Added support for Ampere EMAG8180 + * Added support for Neoverse N1 + * Improved performance of the blas_lock function + * Fixed a race condition in the level3 blas code + * Fixed a performance regression on TSV110-based servers + +x86_64: + * Fixed a long-standing error with undeclared register overwrites + in the DSCAL microkernel for HASWELL,SKYLAKEX and ZEN + * Fixed a long-standing bug in the SSE implementation of IAMAX + * Fixed a CMAKE build failure with DYNAMIC_ARCH + * Fixed cpu autodetection of Goldmont+, Cannon Lake and Ice Lake + * Fixed a compilation failure on OSX with compiler name containing dash + * Fixed compilation with MinGW on SkylakeX + * Improved speed of the AVX512 GEMM3M kernel on SkylakeX + * Added an AVX512 STRMM kernel for SkylakeX + * Improved GEMM performance on Haswell and Zen + +zarch: + * fixed compilation of the DYNAMIC_ARCH code + ==================================================================== Version 0.3.8 9-Feb-2020 From d221c50f2741b31b83e3cbcc005977cb0fe47bc3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Mar 2020 00:02:36 +0100 Subject: [PATCH 03/20] Add Ampere EMAG8180 --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 5b31df045..f4a40ed02 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -89,6 +89,7 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +EMAG8180 FALKOR THUNDERX THUNDERX2T99 From 43c2e845ab3931716b2ec7ad3ac3da2a8a447264 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Sat, 22 Feb 2020 05:31:07 +0000 Subject: [PATCH 04/20] Switch blas_server to use acq/rel semantics Heavy-weight locking isn't required to pass the work queue pointer between threads and simple atomic acquire/release semantics can be used instead. This is especially important as pthread_mutex_lock() isn't fair. We've observed substantial variation in runtime because of the the unfairness of these locks which complety goes away with this implementation. The locks themselves are left to provide a portable way for idling threads to sleep/wakeup after many unsuccessful iterations waiting. --- driver/others/blas_server.c | 99 +++++++++++++++---------------------- 1 file changed, 41 insertions(+), 58 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 6f4e20610..ce028a7fc 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -140,6 +140,16 @@ typedef struct { } thread_status_t; +#if (__STDC_VERSION__ >= 201112L) +#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) +#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) +#else +#define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p)) +#define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v)) +#endif + + + static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE))); #ifndef THREAD_TIMEOUT @@ -312,20 +322,19 @@ blas_queue_t *tscq; last_tick = (unsigned int)rpcc(); - pthread_mutex_lock (&thread_status[cpu].lock); - tscq=thread_status[cpu].queue; - pthread_mutex_unlock (&thread_status[cpu].lock); + tscq = atomic_load_queue(&thread_status[cpu].queue); while(!tscq) { YIELDING; if ((unsigned int)rpcc() - last_tick > thread_timeout) { - pthread_mutex_lock (&thread_status[cpu].lock); - if (!thread_status[cpu].queue) { + if (!atomic_load_queue(&thread_status[cpu].queue)) { + pthread_mutex_lock (&thread_status[cpu].lock); thread_status[cpu].status = THREAD_STATUS_SLEEP; - while (thread_status[cpu].status == THREAD_STATUS_SLEEP) { + while (thread_status[cpu].status == THREAD_STATUS_SLEEP && + !atomic_load_queue(&thread_status[cpu].queue)) { #ifdef MONITOR main_status[cpu] = MAIN_SLEEPING; @@ -333,19 +342,18 @@ blas_queue_t *tscq; pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock); } + pthread_mutex_unlock(&thread_status[cpu].lock); } - pthread_mutex_unlock(&thread_status[cpu].lock); - last_tick = (unsigned int)rpcc(); } - pthread_mutex_lock (&thread_status[cpu].lock); - tscq=thread_status[cpu].queue; - pthread_mutex_unlock (&thread_status[cpu].lock); + + tscq = atomic_load_queue(&thread_status[cpu].queue); } - queue = thread_status[cpu].queue; + queue = atomic_load_queue(&thread_status[cpu].queue); + MB; if ((long)queue == -1) break; @@ -360,9 +368,7 @@ blas_queue_t *tscq; if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; - pthread_mutex_lock (&thread_status[cpu].lock); - thread_status[cpu].queue = (blas_queue_t *)1; - pthread_mutex_unlock (&thread_status[cpu].lock); + atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); sa = queue -> sa; sb = queue -> sb; @@ -442,13 +448,9 @@ blas_queue_t *tscq; // arm: make sure all results are written out _before_ // thread is marked as done and other threads use them - WMB; + MB; + atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0); - pthread_mutex_lock (&thread_status[cpu].lock); - thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ - pthread_mutex_unlock (&thread_status[cpu].lock); - - WMB; } @@ -566,12 +568,9 @@ int blas_thread_init(void){ for(i = 0; i < blas_num_threads - 1; i++){ - thread_status[i].queue = (blas_queue_t *)NULL; + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; - pthread_mutex_init(&thread_status[i].lock, NULL); - pthread_cond_init (&thread_status[i].wakeup, NULL); - #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, &blas_thread_server, (void *)i); @@ -655,7 +654,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ if (queue -> mode & BLAS_NODE) { do { - while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++; + + while((thread_status[i].node != node || atomic_load_queue(&thread_status[i].queue)) && (i < blas_num_threads - 1)) i ++; if (i < blas_num_threads - 1) break; @@ -669,36 +669,26 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ } while (1); } else { - pthread_mutex_lock (&thread_status[i].lock); - tsiq = thread_status[i].queue; - pthread_mutex_unlock (&thread_status[i].lock); + tsiq = atomic_load_queue(&thread_status[i].queue); while(tsiq) { i ++; if (i >= blas_num_threads - 1) i = 0; - pthread_mutex_lock (&thread_status[i].lock); - tsiq = thread_status[i].queue; - pthread_mutex_unlock (&thread_status[i].lock); + tsiq = atomic_load_queue(&thread_status[i].queue); } } #else - pthread_mutex_lock (&thread_status[i].lock); - tsiq=thread_status[i].queue ; - pthread_mutex_unlock (&thread_status[i].lock); + tsiq = atomic_load_queue(&thread_status[i].queue); while(tsiq) { i ++; if (i >= blas_num_threads - 1) i = 0; - pthread_mutex_lock (&thread_status[i].lock); - tsiq=thread_status[i].queue ; - pthread_mutex_unlock (&thread_status[i].lock); + tsiq = atomic_load_queue(&thread_status[i].queue); } #endif queue -> assigned = i; - WMB; - pthread_mutex_lock (&thread_status[i].lock); - thread_status[i].queue = queue; - pthread_mutex_unlock (&thread_status[i].lock); - WMB; + MB; + + atomic_store_queue(&thread_status[i].queue, queue); queue = queue -> next; pos ++; @@ -718,9 +708,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ pos = current -> assigned; - pthread_mutex_lock (&thread_status[pos].lock); - tspq=thread_status[pos].queue; - pthread_mutex_unlock (&thread_status[pos].lock); + tspq = atomic_load_queue(&thread_status[pos].queue); if ((BLASULONG)tspq > 1) { pthread_mutex_lock (&thread_status[pos].lock); @@ -752,24 +740,20 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ while ((num > 0) && queue) { - pthread_mutex_lock(&thread_status[queue->assigned].lock); - tsqq=thread_status[queue -> assigned].queue; - pthread_mutex_unlock(&thread_status[queue->assigned].lock); + tsqq = atomic_load_queue(&thread_status[queue->assigned].queue); while(tsqq) { YIELDING; - pthread_mutex_lock(&thread_status[queue->assigned].lock); - tsqq=thread_status[queue -> assigned].queue; - pthread_mutex_unlock(&thread_status[queue->assigned].lock); - - + tsqq = atomic_load_queue(&thread_status[queue->assigned].queue); }; queue = queue -> next; num --; } + MB; + #ifdef SMP_DEBUG fprintf(STDERR, "Done.\n\n"); #endif @@ -880,7 +864,7 @@ void goto_set_num_threads(int num_threads) { for(i = blas_num_threads - 1; i < num_threads - 1; i++){ - thread_status[i].queue = (blas_queue_t *)NULL; + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; pthread_mutex_init(&thread_status[i].lock, NULL); @@ -971,12 +955,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ for (i = 0; i < blas_num_threads - 1; i++) { + pthread_mutex_lock (&thread_status[i].lock); - thread_status[i].queue = (blas_queue_t *)-1; - + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); thread_status[i].status = THREAD_STATUS_WAKEUP; - pthread_cond_signal (&thread_status[i].wakeup); pthread_mutex_unlock(&thread_status[i].lock); From 917d243580701952a25601f854b0232a2dade8fb Mon Sep 17 00:00:00 2001 From: MacChen02 <58972037+MacChen02@users.noreply.github.com> Date: Mon, 2 Mar 2020 14:36:27 +0800 Subject: [PATCH 05/20] Update benchmark statistical time function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function gettimeofday does not count the time,when testing the axpy small data volume use case. Use the function clock_gettime to replace the gettimeofday function to count the time. --- benchmark/axpy.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmark/axpy.c b/benchmark/axpy.c index 37c7aeb63..e40f93c70 100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -128,7 +128,7 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; + struct timespec start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +175,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + clock_gettime( CLOCK_REALTIME, &start); AXPY (&m, alpha, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + clock_gettime( CLOCK_REALTIME, &stop); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; timeg += time1; @@ -190,7 +190,7 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", + " %10.2f MFlops %10.9f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); } From 0f65c05cd1f83187863546a0e5a11bbc633ab85c Mon Sep 17 00:00:00 2001 From: jianghesong Date: Mon, 2 Mar 2020 19:13:45 +0800 Subject: [PATCH 06/20] fix core dumped error --- benchmark/cholesky.c | 54 ++++++++++++++++++++++++-------------------- benchmark/geev.c | 2 +- benchmark/gemm3m.c | 6 ++--- benchmark/gemv.c | 4 ++-- benchmark/ger.c | 2 +- benchmark/gesv.c | 6 ++--- benchmark/getri.c | 2 +- benchmark/hemm.c | 6 ++--- benchmark/hemv.c | 2 +- benchmark/her2k.c | 6 ++--- benchmark/herk.c | 4 ++-- benchmark/linpack.c | 4 ++-- benchmark/potrf.c | 36 ++++++++++++++--------------- benchmark/symm.c | 6 ++--- benchmark/symv.c | 2 +- benchmark/syr2k.c | 6 ++--- benchmark/syrk.c | 4 ++-- benchmark/trmm.c | 4 ++-- benchmark/trsm.c | 4 ++-- 19 files changed, 83 insertions(+), 77 deletions(-) diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index 8d121efb3..5908b6085 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -173,46 +173,46 @@ int main(int argc, char *argv[]){ #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { - for(i = 0; i < j; i++) a[i + j * m] = 0.; - a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; - for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { - for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; - for(i = j + 1; i < m; i++) a[i + j * m] = 0.; + for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; } } #else if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[(i + j * m) * 2 + 0] = 0.; - a[(i + j * m) * 2 + 1] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 1] = 0.; } - a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; - a[(j + j * m) * 2 + 1] = 0.; + a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } - a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; - a[(j + j * m) * 2 + 1] = 0.; + a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[(i + j * m) * 2 + 0] = 0.; - a[(i + j * m) * 2 + 1] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 1] = 0.; } } } @@ -239,10 +239,13 @@ int main(int argc, char *argv[]){ for (j = 0; j < m; j++) { for(i = 0; i <= j; i++) { #ifndef COMPLEX - if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); + if (maxerr < fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m])) + maxerr = fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m]); #else - if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); - if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); + if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0])) + maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0]); + if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1])) + maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1]); #endif } } @@ -250,10 +253,13 @@ int main(int argc, char *argv[]){ for (j = 0; j < m; j++) { for(i = j; i < m; i++) { #ifndef COMPLEX - if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); + if (maxerr < fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m])) + maxerr = fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m]); #else - if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); - if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); + if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0])) + maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0]); + if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1])) + maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1]); #endif } } diff --git a/benchmark/geev.c b/benchmark/geev.c index d3751defb..ef9271220 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -195,7 +195,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ - a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c index d39543585..f4048c436 100644 --- a/benchmark/gemm3m.c +++ b/benchmark/gemm3m.c @@ -181,9 +181,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/gemv.c b/benchmark/gemv.c index adf8f3d91..a9dee67d2 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -197,7 +197,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -234,7 +234,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/ger.c b/benchmark/ger.c index a752a3c3e..ca7e94e15 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -182,7 +182,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/gesv.c b/benchmark/gesv.c index 26ff8bc1a..80f644e69 100644 --- a/benchmark/gesv.c +++ b/benchmark/gesv.c @@ -177,20 +177,20 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - b[i + j * m * COMPSIZE] = 0.0; + b[(long)i + (long)j * (long)m * COMPSIZE] = 0.0; } } for (j = 0; j < m; ++j) { for (i = 0; i < m * COMPSIZE; ++i) { - b[i] += a[i + j * m * COMPSIZE]; + b[i] += a[(long)i + (long)j * (long)m * COMPSIZE]; } } diff --git a/benchmark/getri.c b/benchmark/getri.c index 083cdc9aa..e8b82a758 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -172,7 +172,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ - a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 318c407ba..a0c549292 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -164,9 +164,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/hemv.c b/benchmark/hemv.c index 05028e3cf..b6ff512ce 100644 --- a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -167,7 +167,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 028e2718f..55421878a 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -163,9 +163,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/herk.c b/benchmark/herk.c index d2e25ff46..bd336e6b1 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -162,8 +162,8 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 7d5c87163..e4b20e99d 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -186,7 +186,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -194,7 +194,7 @@ int main(int argc, char *argv[]){ for (j = 0; j < m; ++j) { for (i = 0; i < m * COMPSIZE; ++i) { - b[i] += a[i + j * m * COMPSIZE]; + b[i] += a[(long)i + (long)j * (long)m * COMPSIZE]; } } diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 1d714549b..580e46072 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -170,46 +170,46 @@ int main(int argc, char *argv[]){ #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { - for(i = 0; i < j; i++) a[i + j * m] = 0.; - a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; - for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { - for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; - for(i = j + 1; i < m; i++) a[i + j * m] = 0.; + for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; } } #else if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[(i + j * m) * 2 + 0] = 0.; - a[(i + j * m) * 2 + 1] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 1] = 0.; } - a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; - a[(j + j * m) * 2 + 1] = 0.; + a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } - a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; - a[(j + j * m) * 2 + 1] = 0.; + a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[(i + j * m) * 2 + 0] = 0.; - a[(i + j * m) * 2 + 1] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 1] = 0.; } } } diff --git a/benchmark/symm.c b/benchmark/symm.c index 35ebcee97..9c26d92fe 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -175,9 +175,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/symv.c b/benchmark/symv.c index df2a5d301..789c3560f 100644 --- a/benchmark/symv.c +++ b/benchmark/symv.c @@ -177,7 +177,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index 9840b5f3e..6b51e4f2b 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -175,9 +175,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 34817f2bb..06582b861 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -172,8 +172,8 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 54c7972db..6a5e59c7b 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -175,8 +175,8 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/trsm.c b/benchmark/trsm.c index 9eae3380c..6ce1d532c 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -191,8 +191,8 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } From 7ca4ffdbddd03697e1c9773f48fa8e890a2d3b86 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Wed, 19 Feb 2020 18:24:01 +0100 Subject: [PATCH 07/20] Improve test coverage for utests. --- utest/CMakeLists.txt | 1 + utest/Makefile | 2 +- utest/test_amax.c | 13 +++++- utest/test_min.c | 100 +++++++++++++++++++++++++++++++++++++++++++ utest/utest_main2.c | 86 ++++++++++++++++++++++++++++++++++++- 5 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 utest/test_min.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 544646911..dc5175fc5 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -6,6 +6,7 @@ if (MSVC AND "${CMAKE_C_COMPILER_ID}" MATCHES Clang) else () set(OpenBLAS_utest_src utest_main.c + test_min.c test_amax.c test_ismin.c test_rotmg.c diff --git a/utest/Makefile b/utest/Makefile index 32bdcc6e1..0b9892411 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,7 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_amax.c b/utest/test_amax.c index 411598410..831804027 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -40,6 +40,17 @@ CTEST(amax, samax){ te_max=BLASFUNC(samax)(&N, x, &inc); tr_max=3.3; - + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } + +CTEST(amax, damax){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={-1.1, 2.2, -3.3}; + + te_max=BLASFUNC(damax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} diff --git a/utest/test_min.c b/utest/test_min.c new file mode 100644 index 000000000..fd31b5982 --- /dev/null +++ b/utest/test_min.c @@ -0,0 +1,100 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +CTEST(min, smin_negative){ + blasint N=3, inc=1; + float te_min=0.0, tr_min=0.0; + float x[]={-1.1, -2.2, -3.3}; + + te_min=BLASFUNC(smin)(&N, x, &inc); + tr_min=-3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} + +CTEST(min, dmin_positive){ + blasint N=3, inc=1; + double te_min=0.0, tr_min=0.0; + double x[]={1.1, 0.0, 3.3}; + + te_min=BLASFUNC(dmin)(&N, x, &inc); + tr_min=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} + +CTEST(min, smin_zero){ + blasint N=3, inc=1; + float te_min=0.0, tr_min=0.0; + float x[]={1.1, 2.2, 0.0}; + + te_min=BLASFUNC(smin)(&N, x, &inc); + tr_min=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} + +CTEST(max, smax_negative){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, -2.2, -3.3}; + + te_max=BLASFUNC(smax)(&N, x, &inc); + tr_max=-1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} + +CTEST(max, dmax_positive){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={1.1, 0.0, 3.3}; + + te_max=BLASFUNC(dmax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} + +CTEST(max, smax_zero){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, -2.2, 0.0}; + + te_max=BLASFUNC(smax)(&N, x, &inc); + tr_max=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} diff --git a/utest/utest_main2.c b/utest/utest_main2.c index aa95a5a3f..6b252863a 100644 --- a/utest/utest_main2.c +++ b/utest/utest_main2.c @@ -50,6 +50,17 @@ CTEST(amax, samax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } +CTEST(amax, damax){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={-1.1, 2.2, -3.3}; + + te_max=BLASFUNC(damax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} + CTEST (drotmg,rotmg) { double te_d1, tr_d1; @@ -508,9 +519,82 @@ CTEST(swap,cswap_inc_0) } } +CTEST(min, smin_negative){ + blasint N=3, inc=1; + float te_min=0.0, tr_min=0.0; + float x[]={-1.1, -2.2, -3.3}; + + te_min=BLASFUNC(smin)(&N, x, &inc); + tr_min=-3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} + +CTEST(min, dmin_positive){ + blasint N=3, inc=1; + double te_min=0.0, tr_min=0.0; + double x[]={1.1, 0.0, 3.3}; + + te_min=BLASFUNC(dmin)(&N, x, &inc); + tr_min=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} + +CTEST(min, smin_zero){ + blasint N=3, inc=1; + float te_min=0.0, tr_min=0.0; + float x[]={1.1, 2.2, 0.0}; + + te_min=BLASFUNC(smin)(&N, x, &inc); + tr_min=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} + +CTEST(max, smax_negative){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, -2.2, -3.3}; + + te_max=BLASFUNC(smax)(&N, x, &inc); + tr_max=-1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} + +CTEST(max, dmax_positive){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={1.1, 0.0, 3.3}; + + te_max=BLASFUNC(dmax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} + +CTEST(max, smax_zero){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, -2.2, 0.0}; + + te_max=BLASFUNC(smax)(&N, x, &inc); + tr_max=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} + int main(int argc, const char ** argv){ - CTEST_ADD(amax, samax); + CTEST_ADD (amax, samax); + CTEST_ADD (amax, damax); + CTEST_ADD (min, smin_negative); + CTEST_ADD (min, dmin_positive); + CTEST_ADD (min, smin_zero); + CTEST_ADD (max, smax_negative); + CTEST_ADD (max, dmax_positive); + CTEST_ADD (max, smax_zero); CTEST_ADD (drotmg,rotmg); CTEST_ADD (drotmg,rotmg_issue1452); CTEST_ADD (drotmg,rotmg_D1eqD2_X1eqX2); From 21f6c4b5a972683f7228e5ad446bc940947c2d2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D9=85=D9=87=D8=AF=D9=8A=20=D8=B4=D9=8A=D9=86=D9=88=D9=86?= =?UTF-8?q?=20=28Mehdi=20Chinoune=29?= Date: Mon, 2 Mar 2020 17:22:28 +0100 Subject: [PATCH 08/20] fixes #2480 --- cmake/cc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 22217575c..d5551147c 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -99,7 +99,7 @@ endif () if (${CORE} STREQUAL "SKYLAKEX") if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - set (CCOMMON_OPT = "${CCOMMON_OPT} -march=skylake-avx512") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif () endif () endif () From 790d50fbba8b1c41be7d0feb3865e5870d61c81f Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Tue, 3 Mar 2020 17:13:49 +0800 Subject: [PATCH 09/20] [OpenBlas]: add benchmark file trmv.c and modify benchmark/Makefile to test s/d/c/ztrmv --- benchmark/Makefile | 87 +++++++++++++++++++++++ benchmark/trmv.c | 170 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100644 benchmark/trmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 1d4a220e4..c037dd6d6 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -73,6 +73,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ @@ -100,6 +101,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -128,6 +130,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -155,6 +158,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -183,6 +187,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -209,6 +214,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -237,6 +243,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -266,6 +273,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -304,6 +312,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ @@ -1108,6 +1117,72 @@ zgemv.mkl : zgemv.$(SUFFIX) zgemv.veclib : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2177,6 +2252,18 @@ cgemv.$(SUFFIX) : gemv.c zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/trmv.c b/benchmark/trmv.c new file mode 100644 index 000000000..84d1903de --- /dev/null +++ b/benchmark/trmv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef TRMV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TRMV BLASFUNC(dtrmv) +#else +#define TRMV BLASFUNC(strmv) +#endif + +#else + +#ifdef DOUBLE +#define TRMV BLASFUNC(ztrmv) +#else +#define TRMV BLASFUNC(ctrmv) +#endif + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *a, *x; + char *p; + + char uplo ='U'; + char trans='N'; + char diag ='U'; + + int loops = 1; + int l; + blasint inc_x=1; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + long n, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from, + to, step, uplo, trans, diag, loops, inc_x); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(n = from; n <= to; n += step) { + timeg=0; + + fprintf(stderr, " %6d : ", (int)n); + for(j = 0; j < n; j++) { + for(i = 0; i < n * COMPSIZE; i++) { + a[i + j * n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + clock_gettime(CLOCK_REALTIME, &start); + TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); + clock_gettime(CLOCK_REALTIME, &stop); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + timeg += time1; + } + + timeg /= loops; + fprintf(stderr, " %10.2f MFlops %12.9f sec\n", + COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg); + } + + return 0; +} \ No newline at end of file From f682d19ed4b247903e37254ac2c4d2f136c237b0 Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Tue, 3 Mar 2020 17:13:49 +0800 Subject: [PATCH 10/20] [OpenBlas]: add benchmark file trmv.c and modify benchmark/Makefile to test s/d/c/ztrmv --- benchmark/Makefile | 87 +++++++++++++++++++++++ benchmark/trmv.c | 172 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+) create mode 100644 benchmark/trmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 1d4a220e4..c037dd6d6 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -73,6 +73,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ @@ -100,6 +101,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -128,6 +130,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -155,6 +158,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -183,6 +187,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -209,6 +214,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -237,6 +243,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -266,6 +273,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -304,6 +312,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ @@ -1108,6 +1117,72 @@ zgemv.mkl : zgemv.$(SUFFIX) zgemv.veclib : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2177,6 +2252,18 @@ cgemv.$(SUFFIX) : gemv.c zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/trmv.c b/benchmark/trmv.c new file mode 100644 index 000000000..969f4f1d4 --- /dev/null +++ b/benchmark/trmv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef TRMV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TRMV BLASFUNC(dtrmv) +#else +#define TRMV BLASFUNC(strmv) +#endif + +#else + +#ifdef DOUBLE +#define TRMV BLASFUNC(ztrmv) +#else +#define TRMV BLASFUNC(ctrmv) +#endif + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *a, *x; + char *p; + + char uplo ='U'; + char trans='N'; + char diag ='U'; + + int loops = 1; + int l; + blasint inc_x=1; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + long n, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from, + to, step, uplo, trans, diag, loops, inc_x); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(n = from; n <= to; n += step) { + timeg=0; + + fprintf(stderr, " %6d : ", (int)n); + for(j = 0; j < n; j++) { + for(i = 0; i < n * COMPSIZE; i++) { + a[i + j * n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + clock_gettime(CLOCK_REALTIME, &start); + TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); + clock_gettime(CLOCK_REALTIME, &stop); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + timeg += time1; + } + + timeg /= loops; + fprintf(stderr, " %10.2f MFlops %12.9f sec\n", + COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From 2afc0748039d5adaede2780e2a30e628a843a0a1 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 3 Mar 2020 12:35:10 -0600 Subject: [PATCH 11/20] Fix DYNAMIC_ARCH build for POWER9 Setting DYNAMIC_ARCH=1 on POWER9 does not build POWER9 files due to some compiler version checks. This patch fixes some of the macros that are used to check compiler version. On fixing those checks, there are some new make failures related to icamin, icamax, isamin, isamax and caxpy files on POWER9. This patch fixes those failures as well. --- Makefile.system | 2 +- driver/others/dynamic_power.c | 8 ++++---- kernel/power/caxpy_power9.S | 4 ++++ kernel/power/icamax_power9.S | 7 +++++++ kernel/power/icamin_power9.S | 7 +++++++ kernel/power/isamax_power9.S | 7 +++++++ kernel/power/isamin_power9.S | 7 +++++++ 7 files changed, 37 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1e30d05a8..829c08f16 100644 --- a/Makefile.system +++ b/Makefile.system @@ -327,7 +327,6 @@ ifeq ($(C_COMPILER), GCC) #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) @@ -575,6 +574,7 @@ ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 endif ifeq ($(C_COMPILER), GCC) +GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) ifeq ($(GCCVERSIONGT5), 1) DYNAMIC_CORE += POWER9 else diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 1dec5f4b3..8c831b998 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -3,7 +3,7 @@ extern gotoblas_t gotoblas_POWER6; extern gotoblas_t gotoblas_POWER8; -#if (!defined C_GCC) || (GCC_VERSION >= 60000) +#if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif @@ -21,7 +21,7 @@ static char *corename[] = { char *gotoblas_corename(void) { if (gotoblas == &gotoblas_POWER6) return corename[1]; if (gotoblas == &gotoblas_POWER8) return corename[2]; -#if (!defined C_GCC) || (GCC_VERSION >= 60000) +#if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; #endif return corename[0]; @@ -33,7 +33,7 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER6; if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; -#if (!defined C_GCC) || (GCC_VERSION >= 60000) +#if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; #endif @@ -59,7 +59,7 @@ static gotoblas_t *force_coretype(char * coretype) { { case 1: return (&gotoblas_POWER6); case 2: return (&gotoblas_POWER8); -#if (!defined C_GCC) || (GCC_VERSION >= 60000) +#if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); #endif default: return NULL; diff --git a/kernel/power/caxpy_power9.S b/kernel/power/caxpy_power9.S index 844cacd50..b4733ff9f 100644 --- a/kernel/power/caxpy_power9.S +++ b/kernel/power/caxpy_power9.S @@ -13,7 +13,11 @@ PROLOGUE +#ifdef CONJ +caxpyc_k: +#else caxpy_k: +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l diff --git a/kernel/power/icamax_power9.S b/kernel/power/icamax_power9.S index 2968b3f8b..bf6ab6e82 100644 --- a/kernel/power/icamax_power9.S +++ b/kernel/power/icamax_power9.S @@ -1,3 +1,4 @@ +/* .file "icamax.c" .abiversion 2 .section ".text" @@ -5,6 +6,12 @@ .p2align 4,,15 .globl icamax_k .type icamax_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + icamax_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha diff --git a/kernel/power/icamin_power9.S b/kernel/power/icamin_power9.S index 8eaa79f33..58a3c53a3 100644 --- a/kernel/power/icamin_power9.S +++ b/kernel/power/icamin_power9.S @@ -1,3 +1,4 @@ +/* .file "icamin.c" .abiversion 2 .section ".text" @@ -5,6 +6,12 @@ .p2align 4,,15 .globl icamin_k .type icamin_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + icamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha diff --git a/kernel/power/isamax_power9.S b/kernel/power/isamax_power9.S index 9df1e773c..259c996fc 100644 --- a/kernel/power/isamax_power9.S +++ b/kernel/power/isamax_power9.S @@ -1,3 +1,4 @@ +/* .file "isamax.c" .abiversion 2 .section ".text" @@ -5,6 +6,12 @@ .p2align 4,,15 .globl isamax_k .type isamax_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + isamax_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha diff --git a/kernel/power/isamin_power9.S b/kernel/power/isamin_power9.S index 0475edf46..36486ff02 100644 --- a/kernel/power/isamin_power9.S +++ b/kernel/power/isamin_power9.S @@ -1,3 +1,4 @@ +/* .file "isamin.c" .abiversion 2 .section ".text" @@ -5,6 +6,12 @@ .p2align 4,,15 .globl isamin_k .type isamin_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + isamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha From 635c9e4e098415266593341df3575d7401295800 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Mar 2020 21:04:12 +0100 Subject: [PATCH 12/20] Restore initializers for mutex and conditional --- driver/others/blas_server.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index ce028a7fc..3d2d5ef7a 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -571,6 +571,9 @@ int blas_thread_init(void){ atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; + pthread_mutex_init(&thread_status[i].lock, NULL); + pthread_cond_init (&thread_status[i].wakeup, NULL) + #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, &blas_thread_server, (void *)i); From d68e4ba59bd71f0515e27fc8ce2eb1fd9c94f63f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Mar 2020 21:37:48 +0100 Subject: [PATCH 13/20] Fix cut/paste glitch --- driver/others/blas_server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 3d2d5ef7a..aa0644845 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -572,7 +572,7 @@ int blas_thread_init(void){ thread_status[i].status = THREAD_STATUS_WAKEUP; pthread_mutex_init(&thread_status[i].lock, NULL); - pthread_cond_init (&thread_status[i].wakeup, NULL) + pthread_cond_init (&thread_status[i].wakeup, NULL); #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, From 114dbec947277f806b73b0e8392849d07847d40c Mon Sep 17 00:00:00 2001 From: Darkness303 <1010287144@qq.com> Date: Wed, 4 Mar 2020 14:09:10 +0800 Subject: [PATCH 14/20] 1.Add syr2 benchmark 2.Fixed some errors --- benchmark/Makefile | 47 ++++++++++- benchmark/syr.c | 4 +- benchmark/syr2.c | 194 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 241 insertions(+), 4 deletions(-) create mode 100644 benchmark/syr2.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 1d4a220e4..b9ffbf381 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -57,6 +57,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -85,6 +86,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -111,7 +113,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - ssyr.goto dsyr.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -140,6 +143,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -167,6 +171,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -194,6 +199,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -221,6 +227,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -251,6 +258,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -289,6 +297,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ ssyr.veclib dsyr.veclib \ + ssyr2.veclib dsyr2.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ @@ -807,6 +816,36 @@ dsyr.mkl : dsyr.$(SUFFIX) dsyr.veclib : dsyr.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -2123,6 +2162,12 @@ ssyr.$(SUFFIX) : syr.c dsyr.$(SUFFIX) : syr.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/syr.c b/benchmark/syr.c index 91b5b5904..458bc6edb 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -173,11 +173,9 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); + COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6); } diff --git a/benchmark/syr2.c b/benchmark/syr2.c new file mode 100644 index 000000000..0129dd09a --- /dev/null +++ b/benchmark/syr2.c @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SYR2 + + +#ifdef DOUBLE +#define SYR2 BLASFUNC(dsyr2) +#else +#define SYR2 BLASFUNC(ssyr2) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *a; + FLOAT alpha[] = {1.0, 1.0}; + char *p; + + char uplo='U'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + blasint m, i, j; + blasint inc_x= 1; + blasint inc_y= 1; + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d Inc_y = %d\n", from, to, step,uplo,inc_x,inc_y); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + fprintf(stderr, " %6d : ", (int)m); + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + gettimeofday( &start, (struct timezone *)0); + + SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + fprintf(stderr, + " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6); + + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From de74e116411b230a6e4b36d06053fe468738f6c8 Mon Sep 17 00:00:00 2001 From: q00437336 Date: Wed, 4 Mar 2020 02:57:33 -0500 Subject: [PATCH 15/20] add benchmark for trsv --- benchmark/Makefile | 87 ++++++++++++++++++ benchmark/trsv.c | 222 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 309 insertions(+) create mode 100644 benchmark/trsv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c037dd6d6..cf8ab3416 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -74,6 +74,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ @@ -102,6 +103,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -131,6 +133,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -159,6 +162,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -188,6 +192,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -215,6 +220,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -244,6 +250,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -274,6 +281,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -313,6 +321,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ @@ -1183,6 +1192,72 @@ ztrmv.mkl : ztrmv.$(SUFFIX) ztrmv.veclib : ztrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Strsv #################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsv.acml : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.atlas : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.mkl : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2264,6 +2339,18 @@ ctrmv.$(SUFFIX) : trmv.c ztrmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/trsv.c b/benchmark/trsv.c new file mode 100644 index 000000000..8652eb331 --- /dev/null +++ b/benchmark/trsv.c @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include +#include "common.h" + + +#undef GEMV +#undef TRSV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TRSV BLASFUNC(dtrsv) +#else +#define TRSV BLASFUNC(strsv) +#endif + +#else + +#ifdef DOUBLE +#define TRSV BLASFUNC(ztrsv) +#else +#define TRSV BLASFUNC(ctrsv) +#endif + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x; + blasint n = 0, i, j; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec time_start, time_end; + time_t seconds = 0; + + double time1,timeg; + long long nanos = 0; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + char uplo ='L'; + char transa = 'N'; + char diag ='U'; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_TRANSA"))) transa=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Transa = '%c' Inc_x = %d uplo=%c diag=%c loop = %d\n", from, to, step,transa,inc_x, + uplo,diag,loops); + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + fprintf(stderr, "============================================\n"); + + for(n = from; n <= to; n += step) + { + timeg=0; + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * n * n * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * n * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + for(j = 0; j < n; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(l =0;l< loops;l++){ + + clock_gettime(CLOCK_REALTIME,&time_start); + + TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); + + clock_gettime(CLOCK_REALTIME,&time_end); + nanos = time_end.tv_nsec - time_start.tv_nsec; + seconds = time_end.tv_sec - time_start.tv_sec; + + time1 = seconds + nanos /1.e9; + timeg += time1; + } + + + timeg /= loops; + long long muls = n*(n+1)/2.0; + long long adds = (n - 1.0)*n/2.0; + + fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg); + if(a != NULL){ + free(a); + } + + if( x != NULL){ + free(x); + } + + } + + return 0; +} + From 233838b4bca6dbc08a6db3899629b7393b10fc19 Mon Sep 17 00:00:00 2001 From: q00437336 Date: Wed, 4 Mar 2020 03:54:40 -0500 Subject: [PATCH 16/20] change clock to CLOCK_PROCESS_CPUTIME_ID --- benchmark/trsv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 8652eb331..c60890de4 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -189,11 +189,11 @@ int main(int argc, char *argv[]){ for(l =0;l< loops;l++){ - clock_gettime(CLOCK_REALTIME,&time_start); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start); TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); - clock_gettime(CLOCK_REALTIME,&time_end); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end); nanos = time_end.tv_nsec - time_start.tv_nsec; seconds = time_end.tv_sec - time_start.tv_sec; From 0f1a2b12f91b83c39721cbf1585bafa5fa1663fc Mon Sep 17 00:00:00 2001 From: s00527847 Date: Wed, 4 Mar 2020 15:50:19 -0500 Subject: [PATCH 17/20] add benchmark for spr/spr2 --- benchmark/Makefile | 96 ++++++++++++++++++++- benchmark/spr.c | 198 +++++++++++++++++++++++++++++++++++++++++++ benchmark/spr2.c | 207 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 500 insertions(+), 1 deletion(-) create mode 100755 benchmark/spr.c create mode 100755 benchmark/spr2.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c037dd6d6..660f44fee 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -57,6 +57,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyr.goto dsyr.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -86,6 +88,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyr.acml dsyr.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -113,7 +117,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - ssyr.goto dsyr.atlas \ + ssyr.atlas dsyr.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -143,6 +149,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyr.mkl dsyr.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -171,6 +179,8 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyr.goto dsyr.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -199,6 +209,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyr.acml dsyr.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -227,6 +239,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyr.atlas dsyr.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -258,6 +272,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyr.mkl dsyr.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -297,6 +313,8 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ ssyr.veclib dsyr.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ @@ -816,6 +834,70 @@ dsyr.mkl : dsyr.$(SUFFIX) dsyr.veclib : dsyr.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -2197,6 +2279,18 @@ ssyr.$(SUFFIX) : syr.c dsyr.$(SUFFIX) : syr.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/spr.c b/benchmark/spr.c new file mode 100755 index 000000000..61a972c08 --- /dev/null +++ b/benchmark/spr.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SPR + +#ifdef DOUBLE +#define SPR BLASFUNC(dspr) +#else +#define SPR BLASFUNC(sspr) +#endif + + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a,*c; + FLOAT alpha[] = {1.0, 1.0}; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + char uplo='U'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + blasint m, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d\n", from, to, step,uplo,inc_x); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time\n"); + + for(m = from; m <= to; m += step) + { + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SPR2 + +#ifdef DOUBLE +#define SPR2 BLASFUNC(dspr2) +#else +#define SPR2 BLASFUNC(sspr2) +#endif + + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a,*b,*c; + FLOAT alpha[] = {1.0, 1.0}; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + char uplo='U'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + blasint m, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d Inc_y = %d\n", from, to, step,uplo,inc_x,inc_y); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time\n"); + + for(m = from; m <= to; m += step) + { + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l=0; l Date: Thu, 5 Mar 2020 09:55:16 +0800 Subject: [PATCH 18/20] Add benchmark file rotm.c and modify benchmark/Makefile to test s/drotm modified: benchmark/Makefile new file: benchmark/rotm.c --- benchmark/Makefile | 44 ++++++++++ benchmark/rotm.c | 210 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 benchmark/rotm.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c037dd6d6..699670e33 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -62,6 +62,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ srot.goto drot.goto \ + srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -90,6 +91,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -118,6 +120,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -147,6 +150,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl \ + srotm.mkl drotm.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -176,6 +180,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ srot.goto drot.goto \ + srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -203,6 +208,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -231,6 +237,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -262,6 +269,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srotm.atlas drotm.atlas \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -301,6 +309,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srotm.veclib drotm.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ @@ -1639,6 +1648,37 @@ drot.mkl : drot.$(SUFFIX) drot.veclib : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) @@ -2432,7 +2472,11 @@ srot.$(SUFFIX) : rot.c drot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +srotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ diff --git a/benchmark/rotm.c b/benchmark/rotm.c new file mode 100644 index 000000000..8dea2d08c --- /dev/null +++ b/benchmark/rotm.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef ROTM + +#ifdef DOUBLE +#define ROTM BLASFUNC(drotm) +#else +#define ROTM BLASFUNC(srotm) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz) +{ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid = + shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT | 0600)) < 0) { + printf("Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf("Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x = 1, inc_y = 1; + FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) + inc_y = atoi(p); + + fprintf( + stderr, + "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, inc_x, inc_y, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + + if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + gettimeofday(&start, (struct timezone *)0); + + ROTM(&m, x, &inc_x, y, &inc_y, param); + + gettimeofday(&stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + timeg += time1; + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} From 32c847df45c43a043f720049eada5d727026e26c Mon Sep 17 00:00:00 2001 From: chenxuqiang Date: Fri, 6 Mar 2020 01:02:02 -0500 Subject: [PATCH 19/20] benchmark/hpmv&hbmv: add benchmark/hpmv.c and benchmark/hbmv.c Signed-off-by: Xuqiang Chen chenxuqiang3@hisilicon.com --- benchmark/Makefile | 93 ++++++++++++++++++++ benchmark/hbmv.c | 210 +++++++++++++++++++++++++++++++++++++++++++++ benchmark/hpmv.c | 207 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 510 insertions(+) create mode 100644 benchmark/hbmv.c create mode 100644 benchmark/hpmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c037dd6d6..c554a57b1 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -69,6 +69,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -97,6 +99,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ @@ -125,6 +129,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -154,6 +160,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -183,6 +191,8 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto \ chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -210,6 +220,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ @@ -238,6 +250,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -269,6 +283,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -308,6 +324,8 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ @@ -1542,7 +1560,70 @@ zhemv.mkl : zhemv.$(SUFFIX) zhemv.veclib : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### +chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### + +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + +zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2331,6 +2412,18 @@ chemv.$(SUFFIX) : hemv.c zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sdot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c new file mode 100644 index 000000000..b9dcc03bb --- /dev/null +++ b/benchmark/hbmv.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef HBMV + + +#ifdef DOUBLE +#define HBMV BLASFUNC(zhbmv) +#else +#define HBMV BLASFUNC(chbmv) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz) { + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) { + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {0.0, 0.0}; + blasint k = 1; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_K"))) k = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, uplo, k, inc_x, inc_y, loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + gettimeofday( &start, (struct timezone *)0); + + HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + timeg += time1; + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c new file mode 100644 index 000000000..6e6634fcf --- /dev/null +++ b/benchmark/hpmv.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef HPMV + + +#ifdef DOUBLE +#define HPMV BLASFUNC(zhpmv) +#else +#define HPMV BLASFUNC(chpmv) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz) { + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) { + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + gettimeofday( &start, (struct timezone *)0); + + HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + timeg += time1; + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From 208c7e7ca50a8bfdfabbec750bdc538023c94aed Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Mon, 24 Feb 2020 05:45:30 +0000 Subject: [PATCH 20/20] Use acq/rel semantics to pass flags/pointers in getrf_parallel. The current implementation has locks, but the locks each only have a critical section of one variable so atomic reads/writes with barriers can be used to achieve the same behavior. Like the previous patch, pthread_mutex_lock isn't fair, so in a tight loop the previous thread that has the lock can keep it starving another thread, even if that thread is about to write the data that will stop the current thread from spinning. On a 64c Arm system this improves performance by 20x on sgesv.goto. --- lapack/getrf/getrf_parallel.c | 119 +++++++++++++--------------------- 1 file changed, 44 insertions(+), 75 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index c82defcab..c602822a8 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -68,23 +68,14 @@ double sqrt(double); #define GETRF_FACTOR 1.00 -#if defined(USE_PTHREAD_LOCK) -static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER; -#elif defined(USE_PTHREAD_SPINLOCK) -static pthread_spinlock_t getrf_lock = 0; +#if (__STDC_VERSION__ >= 201112L) +#define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED) +#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else -static BLASULONG getrf_lock = 0UL; +#define atomic_load_long(p) (BLASLONG)(*(volatile BLASLONG*)(p)) +#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p)) = (v) #endif -#if defined(USE_PTHREAD_LOCK) -static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER; -#elif defined(USE_PTHREAD_SPINLOCK) -static pthread_spinlock_t getrf_flag_lock = 0; -#else -static BLASULONG getrf_flag_lock = 0UL; -#endif - - static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { @@ -119,11 +110,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *sbb = sb; -#if __STDC_VERSION__ >= 201112L - _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; -#else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; -#endif blasint *ipiv = (blasint *)args -> c; @@ -180,7 +167,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra } } - if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0; + if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) { + MB; + atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0); + } for (is = 0; is < m; is += GEMM_P){ min_i = m - is; @@ -201,14 +191,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra /* Non blocking implementation */ typedef struct { -#if __STDC_VERSION__ >= 201112L - _Atomic -#else - volatile -#endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; + #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); @@ -246,11 +232,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * blasint *ipiv = (blasint *)args -> c; BLASLONG jw; -#if __STDC_VERSION__ >= 201112L - _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; -#else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; -#endif + if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); @@ -280,10 +263,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * #if 1 { do { - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); + jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside]); } while (jw); + MB; } #else while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; @@ -326,21 +308,17 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * } MB; for (i = 0; i < args -> nthreads; i++) { - LOCK_COMMAND(&getrf_lock); - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - UNLOCK_COMMAND(&getrf_lock); + atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]); } } - LOCK_COMMAND(&getrf_flag_lock); - flag[mypos * CACHE_LINE_SIZE] = 0; - UNLOCK_COMMAND(&getrf_flag_lock); + MB; + atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0); if (m == 0) { + MB; for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - LOCK_COMMAND(&getrf_lock); - job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; - UNLOCK_COMMAND(&getrf_lock); + atomic_store_long(&job[mypos].working[mypos][CACHE_LINE_SIZE * xxx], 0); } } @@ -366,10 +344,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if ((current != mypos) && (!is)) { #if 1 do { - LOCK_COMMAND(&getrf_lock); - jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); - } while (jw == 0); + jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]); + } while (jw == 0); + MB; #else while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; #endif @@ -381,9 +358,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * MB; if (is + min_i >= m) { - LOCK_COMMAND(&getrf_lock); - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; - UNLOCK_COMMAND(&getrf_lock); + atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], 0); } } @@ -397,10 +372,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { #if 1 do { - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; - UNLOCK_COMMAND(&getrf_lock); + jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE *xxx]); } while(jw != 0); + MB; #else while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; #endif @@ -443,12 +417,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #ifdef _MSC_VER BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; #else -#if __STDC_VERSION__ >= 201112L - _Atomic -#else - volatile -#endif - BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); + volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #endif #ifndef COMPLEX @@ -543,7 +512,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (width > mn - is - bk) width = mn - is - bk; } - if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]); + + if (num_cpu > 0) { + WMB; + exec_blas_async_wait(num_cpu, &queue[0]); + } mm = m - bk - is; nn = n - bk - is; @@ -608,7 +581,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - flag[num_cpu * CACHE_LINE_SIZE] = 1; + atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1); num_cpu ++; @@ -637,6 +610,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (num_cpu > 0) { queue[num_cpu - 1].next = NULL; + WMB; + exec_blas_async(0, &queue[0]); inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); @@ -647,14 +622,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (i = 0; i < num_cpu; i ++) { #if 1 - LOCK_COMMAND(&getrf_flag_lock); - f=flag[i*CACHE_LINE_SIZE]; - UNLOCK_COMMAND(&getrf_flag_lock); - while (f!=0) { - LOCK_COMMAND(&getrf_flag_lock); - f=flag[i*CACHE_LINE_SIZE]; - UNLOCK_COMMAND(&getrf_flag_lock); - }; + do { + f = atomic_load_long(&flag[i*CACHE_LINE_SIZE]); + } while (f != 0); + MB; #else while (flag[i*CACHE_LINE_SIZE]) {}; #endif @@ -719,12 +690,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, nn, num_cpu; -#if __STDC_VERSION__ >= 201112L - _Atomic -#else - volatile -#endif - BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); + volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #ifndef COMPLEX #ifdef XDOUBLE @@ -833,6 +799,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, nn = n - bk - is; if (width > nn) width = nn; + WMB; + if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]); range[0] = 0; @@ -867,7 +835,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - flag[num_cpu * CACHE_LINE_SIZE] = 1; + atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1); num_cpu ++; } @@ -882,6 +850,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_n_new[0] = offset + is; range_n_new[1] = offset + is + bk; + WMB; if (num_cpu > 1) { exec_blas_async(1, &queue[1]); @@ -917,7 +886,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #endif - for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; + for (i = 1; i < num_cpu; i ++) while (atomic_load_long(&flag[i * CACHE_LINE_SIZE])) {}; TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);