Compare commits

...

440 Commits

Author SHA1 Message Date
Zhang Xianyi
92058a75e2 For gemm multi-threading, simply split M.
e.g.
layer 1: A (1600k, 576), B(576, 64)

B is very small. We split M.
2015-11-25 05:14:56 +08:00
Zhang Xianyi
da7f69e8f4 Refs #699. Split the obj list of LAPACKE 3.6.0. 2015-11-24 13:15:28 -06:00
Zhang Xianyi
044fb91ea5 Merge pull request #690 from rayglover/msvc-fix
(Visual Studio) Don't use C99 complex numbers when building C++ code.
2015-11-23 11:05:37 -06:00
Zhang Xianyi
b4380acf77 Merge pull request #696 from ashwinyes/develop_20151120_lapack_test_fixes
Cortex A57 fixes and Lapack 3.6.0
2015-11-23 11:04:42 -06:00
Werner Saar
d1dd4e302e fix for bad or outdated mingw compiler 2015-11-23 16:20:14 +01:00
Ashwin Sekhar T K
318f0949c3 lapack-test fixes in nrm2 kernels for Cortex A57 2015-11-23 13:43:36 +05:30
Werner Saar
299cdcdc29 lapack fixes for Windos 2015-11-21 14:33:27 +01:00
Werner Saar
a8516c5b47 fixes for cross compile 2015-11-21 10:48:37 +01:00
Werner Saar
c40538eaeb bugfix for cross compiling 2015-11-20 13:47:22 +01:00
Werner Saar
33e37d01b3 added lapack-3.6.0 2015-11-20 09:45:46 +01:00
Werner Saar
64db4576e6 removed lapack-3.5.0 2015-11-20 09:41:59 +01:00
Werner Saar
0d22551a6b increase the stack size limit in the constructor 2015-11-20 09:23:01 +01:00
Ashwin Sekhar T K
1d121852c1 Fix blas_lock for arm64 2015-11-20 01:45:35 +05:30
Ashwin Sekhar T K
98965da2e8 lapack-test fixes for Cortex A57 2015-11-20 01:15:04 +05:30
Ashwin Sekhar T K
39937d15cd Change BUFFER_SIZE for Cortex A57 to 20 MB
Change the GEMM_P, GEMM_Q, GEMM_R values for Cortex A57
2015-11-20 01:12:04 +05:30
Ray Glover
a9d7eee0dc (Visual Studio) Don't use C99 complex numbers when building C++ code. 2015-11-17 17:29:30 +00:00
Zhang Xianyi
e31948ceb0 Fix #686. Merge branch 'ashwinyes-develop' into develop 2015-11-11 04:30:26 +08:00
Zhang Xianyi
233ec2a1cc Use 40 MB buffer for ARM Cortex A57. 2015-11-11 04:22:34 +08:00
Zhang Xianyi
a4c6a88a65 Delete vi swap file. 2015-11-11 04:19:43 +08:00
Zhang Xianyi
faf0811483 Merge branch 'develop' of https://github.com/ashwinyes/OpenBLAS into ashwinyes-develop 2015-11-11 04:16:22 +08:00
Zhang Xianyi
4e4a3e783f Update develop version. 2015-11-11 04:14:58 +08:00
Zhang Xianyi
d00ada378f Merge pull request #684 from sebastien-villemot/develop
Fix detection of POWER architecture in c_check.
2015-11-09 11:39:21 -06:00
Sébastien Villemot
41407acc19 Fix detection of POWER architecture in c_check.
This is necessary to avoid the false detection of a cross-compiling
environment.
2015-11-09 18:36:04 +01:00
Ashwin Sekhar T K
67874468a6 Fix bug in benchmark/gemm.c 2015-11-09 14:15:54 +05:30
Ashwin Sekhar T K
c99c43d51e Optimized trmm kernels for CORTEXA57 2015-11-09 14:15:54 +05:30
Ashwin Sekhar T K
1397b47197 Optimized zgemm kernel for CORTEXA57 2015-11-09 14:15:53 +05:30
Ashwin Sekhar T K
45f78963ac Optimized cgemm kernel for CORTEXA57
Also, add a generic ztrmm 4x4 kernel
2015-11-09 14:15:53 +05:30
Ashwin Sekhar T K
402443bf9c Optimized dgemm kernel for CORTEXA57 2015-11-09 14:15:53 +05:30
Ashwin Sekhar T K
19fdbee291 Improve the sgemm kernel for CORTEXA57 2015-11-09 14:15:53 +05:30
Ashwin Sekhar T K
3b0cdfab1e Optimized gemv kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:52 +05:30
Ashwin Sekhar T K
46efa6a1da Optimized swap kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:52 +05:30
Ashwin Sekhar T K
ea1465cdf8 Optimized scal kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:52 +05:30
Ashwin Sekhar T K
fb4be3b3eb Optimized rot kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:52 +05:30
Ashwin Sekhar T K
6c2f4ddbcd Optimized nrm2 kernels for CORTEXA57 2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
870c4d49c0 Optimized dot kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
cd7684097c Optimized copy kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
2690b71b1f Optimized axpy kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
3e4acedf0e Optimized asum kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
2610752dbb Optimized iamax kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:50 +05:30
Ashwin Sekhar T K
dbb213655e Optimized amax kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:50 +05:30
Ashwin Sekhar T K
9742dba595 Fix compiler errors in common.h 2015-11-09 14:15:50 +05:30
Ashwin Sekhar T K
f2f8a0fe8b Adding arm64 target CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:50 +05:30
Ralph Campbell
55a0b27c01 Minor C code fixes in interface/ 2015-11-09 14:15:49 +05:30
Ralph Campbell
fbc21266e6 Minor C code fixes in driver/ 2015-11-09 14:15:49 +05:30
Ralph Campbell
c053559ed9 Minor C code fixes in kernel/arm 2015-11-09 14:15:49 +05:30
Ralph Campbell
55e4332f00 Remove duplicate -D args in kernel/Makefile.L1 2015-11-09 14:15:48 +05:30
Zhang Xianyi
a550431ee6 Refs #682. Enable LAPACK_COMPLEX_STRUCTURE when __ANDROID_API_ < 21. 2015-11-06 23:46:20 -06:00
Zhang Xianyi
839395fc25 Detect AMD Trinity and Richland. 2015-10-29 02:53:29 +08:00
Zhang Xianyi
1331642f24 Merge pull request #677 from j-bo/develop
Refs #676.  Fixed ONLY_CBLAS=1 compiling bug on windows.
2015-10-28 09:44:25 -05:00
j-bo
1e0bbea868 Refs #676. Fixed ONLY_CBLAS=1 compiling bug on windows. 2015-10-28 15:10:42 +01:00
Zhang Xianyi
53e849f4fc Merge branch 'develop' 2015-10-27 15:44:50 -05:00
Zhang Xianyi
8447498b50 Update doc for OpenBLAS 0.2.15 version. [CI skipped] 2015-10-27 15:44:35 -05:00
Zhang Xianyi
b6519159f5 Merge pull request #674 from j-bo/develop
Fix #673
2015-10-27 10:49:22 -05:00
Zhang Xianyi
63c56d3da9 Only include complex.h since Android 5.0 2015-10-27 10:47:55 -05:00
j-bo
718d0f18e3 Merge pull request #1 from j-bo/j-bo-patch-673
Fix #673
2015-10-27 13:56:45 +01:00
j-bo
6040858b22 Fix #673
Add lacking headers declarations when compiling for Android ARM7
2015-10-27 13:55:24 +01:00
Zhang Xianyi
70642fe4ed Refs #668. Raise the signal when pthread_create fails.
Thank James K. Lowden for the patch.
2015-10-26 19:02:51 -05:00
Zhang Xianyi
79d4a62e10 Add AppVeyor badge. 2015-10-26 18:14:41 -05:00
Zhang Xianyi
1ac8c32f1d [ci skip] Build Visual Studio 12 Win64 on Appveyor 2015-10-26 18:08:54 -05:00
Zhang Xianyi
0b2ad98e48 Only test x64 Windows CI. 2015-10-27 05:11:07 +08:00
Zhang Xianyi
69363622a8 Fix DYNAMIC_ARCH=1 bug. 2015-10-27 05:10:40 +08:00
Zhang Xianyi
e6d754fddc Use AppVeyor Windows CI. 2015-10-26 15:08:17 -05:00
Zhang Xianyi
f74ff6da38 Merge branch 'cmake' into develop 2015-10-26 14:54:59 -05:00
Zhang Xianyi
2feef49fa8 Merge branch 'develop' into cmake
Conflicts:
	driver/others/memory.c
2015-10-26 14:54:34 -05:00
Zhang Xianyi
53b6023a6c Fix cmake bug on MSVC 32-bit. 2015-10-26 14:52:13 -05:00
Zhang Xianyi
309875de3c Fix cmake bug on x86 32-bit.
e.g. Build 32-bit on 64-bit Linux.
cmake -DBINARY=32
2015-10-27 02:54:53 +08:00
Zhang Xianyi
b809f99cee Add CBLAS test for CMAKE. 2015-10-26 23:42:21 +08:00
Zhang Xianyi
5a291606ad Refs #671. the return of i?max cannot larger than N. 2015-10-24 01:16:34 +08:00
Zhang Xianyi
1ce054fcb3 Refs #669. Fixed the build bug with gcc on Mac OS X. 2015-10-22 11:07:35 -05:00
Zhang Xianyi
8fade093aa Fixed cmake bug on Visual Studio. 2015-10-20 14:37:22 -05:00
Zhang Xianyi
96f0bbe067 Fixed cmake bug on haswell. 2015-10-21 02:24:54 +08:00
Zhang Xianyi
d8392c1245 Fixe cmake config bugs. 2015-10-20 04:30:55 +08:00
Zhang Xianyi
aca7d7e953 Detect cmake test result. 2015-10-20 03:35:25 +08:00
Zhang Xianyi
94b125255f Merge branch 'develop' into cmake
Conflicts:
	driver/others/memory.c
2015-10-13 04:46:08 +08:00
Zhang Xianyi
3684706a12 Include time.h. 2015-10-08 15:18:54 +00:00
Zhang Xianyi
90aa8e24b9 Refs #615. Import bug fixes for LAPACKE dormlq. 2015-10-07 02:31:51 +08:00
Zhang Xianyi
11ac4665c8 Fixed #654. Make sure the gotoblas_init function is run before all other static initializations. 2015-10-05 14:14:32 -05:00
Zhang Xianyi
c666158b79 Merge pull request #656 from stevengj/libname
default to lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)
2015-10-05 10:25:15 -05:00
Zhang Xianyi
ccf581f94d Merge pull request #659 from Keno/patch-2
Fix cross compilation suffix detection
2015-10-05 10:23:52 -05:00
Keno Fischer
e9493f69eb Fix cross compilation suffix detection
If the path involves `-`, this would have otherwise detected this as a cross compile suffix.
2015-10-05 00:58:07 -04:00
Steven G. Johnson
88bef3bffc default to lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX), as discussed in #646: if you rename the symbols, it is best to rename the library 2015-10-01 15:07:04 -04:00
Zhang Xianyi
f27942a68a Fixed make TARGET=CORTEXA9 and CORTEXA15 bug. 2015-09-26 14:42:44 +00:00
Zhang Xianyi
0cc2b3de0b Merge pull request #652 from larsmans/fixes
Tiny fixes
2015-09-22 10:01:59 -05:00
Lars Buitinck
b9534bbd76 git ignore versioned .so files 2015-09-22 12:01:09 +02:00
Lars Buitinck
45c8b5e756 actually remove cblas_noconst.h
This file hasn't been used since 212463dce9.
2015-09-22 12:00:30 +02:00
Zhang Xianyi
a96a4cb012 Merge pull request #640 from kortschak/dlansy-fix
Fix LAPACK_*lansy routines
2015-09-10 10:36:57 -05:00
Zhang Xianyi
baec8f5cac Refs #638. Fixed compiling bug with clang on Mac OS X. 2015-09-10 10:32:07 -05:00
kortschak
d6e8459f20 Fix LAPACK_*lansy routines
Fixes #639.
2015-09-10 15:32:50 +09:30
Zhang Xianyi
dfe1eef33b Merge branch 'yuyichao-skylake-id' into develop 2015-09-09 10:48:15 -05:00
Zhang Xianyi
cc7cab8a45 Detect other Intel Skylake cores.
http://users.atw.hu/instlatx64/
2015-09-09 10:47:17 -05:00
Yichao Yu
61ae47eb99 Ref #632. Support Intel Skylake by Haswell kernels. 2015-09-09 11:07:33 -04:00
Zhang Xianyi
22353b1727 Merge pull request #634 from kortschak/lantr-trans-prep
Fix lantr preparation for row major matrices
2015-09-09 09:56:07 -05:00
kortschak
efffd28739 Fix lantr preparation for row major matrices 2015-09-09 09:25:48 +09:30
Zhang Xianyi
62cabef857 Merge pull request #633 from grisuthedragon/tune_imatcopy
Improved Ximatcopy when lda==ldb.
2015-09-08 13:59:08 -05:00
Martin Koehler
711ca33bc6 Improved Ximatcopy when lda==ldb.
The Ximatcopy functions create a copy of the input matrix
although they seem to work inplace. The new routines
XIMATCOPY_K_YY perform the operations inplace if the leading
dimension does not change.
2015-09-07 14:36:16 +02:00
Zhang Xianyi
40a3fed6b8 Merge pull request #630 from buffer51/develop
Fixed error in common.h for Android compilation introduced by e12cf11
2015-09-04 13:01:01 -05:00
buffer51
2297a2d989 Fixed error in common.h for Android compilation introduced by e12cf1123e 2015-09-03 20:54:21 -04:00
Zhang Xianyi
5408074941 Add notification. 2015-08-19 22:50:25 -05:00
Zhang Xianyi
bbcdf63bb4 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2015-08-19 22:48:55 -05:00
Zhang Xianyi
43eabab62f Merge pull request #619 from gitter-badger/gitter-badge
Add a Gitter chat badge to README.md
2015-08-19 22:26:20 -05:00
The Gitter Badger
50901943fd Added Gitter badge 2015-08-20 03:21:09 +00:00
Zhang Xianyi
7df0820160 Use C kernels for s/dgemv on x86. 2015-08-19 08:07:47 -05:00
Zhang Xianyi
17ee2237c3 Fixed cmake bug with NO_LAPACK=1 2015-08-18 22:43:42 -05:00
Zhang Xianyi
4b7381b7a4 Merge pull request #617 from notaz/arm_fixes
really fix ARM64 locking
2015-08-17 15:22:37 -05:00
Grazvydas Ignotas
abade3f896 really fix ARM64 locking 2015-08-17 01:27:45 +02:00
Zhang Xianyi
d1349e7a11 Merge pull request #616 from notaz/arm_fixes
ARM fixes
2015-08-16 17:16:18 -05:00
Grazvydas Ignotas
3efeaed0d8 correct a minor mistake 2015-08-16 20:12:04 +02:00
Grazvydas Ignotas
d38a1ddc7a use real armv5 support
there is no more requirement for ARMv6 instructions,
and VFP on ARMv5 is uncommon
2015-08-16 18:59:18 +02:00
Grazvydas Ignotas
6b92204a7c add fallback blas_lock implementation
to be used on armv5 and new platforms
2015-08-16 18:59:17 +02:00
Grazvydas Ignotas
f2ac1a5cee set ARMV7 for Cortex-A9 and Cortex-A15
otherwise some macros like YIELDING are not defined correctly
2015-08-16 18:59:17 +02:00
Grazvydas Ignotas
e12cf1123e add fallback rpcc implementation
- use on arm, arm64 and any new platform
- use faster integer math instead of double
- use similar scale as rdtsc so that timeouts work
2015-08-16 18:59:16 +02:00
Grazvydas Ignotas
d3e2f0a1af add missing barriers
should fix issue #597
2015-08-16 15:37:02 +02:00
Grazvydas Ignotas
c2323dd4d2 really fix ARM locking
- was writing 0 to lock variable, so was ineffective
- only exit loop if both lock was 0 and strex was successful
2015-08-16 15:18:42 +02:00
Zhang Xianyi
f8eba3d548 Fixed cmake build bugs on Linux. 2015-08-11 16:25:16 -05:00
Zhang Xianyi
40ab5cfc50 Merge branch 'hpanderson_cmake' into cmake 2015-08-11 03:31:55 +08:00
Zhang Xianyi
b7a8f9ad47 Merge branch 'cmake' of https://github.com/hpanderson/OpenBLAS into hpanderson_cmake 2015-08-11 03:31:07 +08:00
Zhang Xianyi
f874465bb8 Use cmake to build OpenBLAS GENERIC Target on MSVC x86 64-bit.
Disable CBLAS and LAPACK.
2015-08-10 14:10:44 -05:00
Zhang Xianyi
bb6e050509 Merge pull request #614 from xantares/cmake_version
install OpenBLASConfigVersion.cmake
2015-08-06 13:15:51 -05:00
xantares
87336b9acf install OpenBLASConfigVersion.cmake 2015-08-06 20:03:50 +02:00
Hank Anderson
19664f3ef4 Added missing lapacke.cmake file. 2015-08-06 07:40:06 -05:00
Zhang Xianyi
c50661e5b7 Merge pull request #613 from fabioperez/develop
Add POWER7/POWER8 as targets
2015-08-05 09:19:17 -05:00
Fábio Perez
b8d64a856a Add POWER7/POWER8 as targets 2015-08-05 11:02:39 -03:00
Zhang Xianyi
898fc7552a Merge pull request #612 from ibmsoe/ppc64le
ppc64le platform support (ELF ABI v2)
2015-08-04 16:58:24 -05:00
Zhang Xianyi
ab0a0a75fc Merge branch 'develop' into cmake 2015-08-03 23:59:01 -05:00
Zhang Xianyi
1cf2b10224 Use pure C generic target on x86 and x86_64.
make TARGET=GENERIC

?gemm3m is unimplemented on generic target.
2015-08-03 23:55:56 -05:00
Zhang Xianyi
7ac7e147d4 Fixed cmake building bugs on Linux. Disable LAPACK by default. 2015-08-04 04:37:05 +08:00
Matthew Brandyberry
7ba4fe5afb ppc64le platform support (ELF ABI v2) 2015-07-21 22:20:19 -05:00
Zhang Xianyi
a55377e9a4 Merge branch 'hpanderson_cmake' into cmake 2015-07-22 04:07:27 +08:00
Zhang Xianyi
dcd5ba4443 Merge branch 'cmake' of https://github.com/hpanderson/OpenBLAS into hpanderson_cmake 2015-07-22 04:06:39 +08:00
Zhang Xianyi
3f1b57668e Fix blas lock bug on AArch64. 2015-06-26 11:54:41 +08:00
Zhang Xianyi
d8f18d32c3 Merge pull request #595 from tanderson92/fixTests
Fix test execution when USE_OPENMP=0
2015-06-22 21:54:51 -05:00
wernsaar
bdb5c842fc Merge pull request #596 from wernsaar/develop
optimizations for haswell
2015-06-13 16:44:48 +02:00
Werner Saar
e7c969e164 added optimized dtrmm_kernel for haswell 2015-06-13 16:16:29 +02:00
Werner Saar
9bd962f655 modified haswell parameter dgemm_unroll_n 2015-06-13 10:28:27 +02:00
Thomas Anderson
4f5691e5c0 Fix test execution when USE_OPENMP=0
The standard way to disable OpenMP support is to set USE_OPENMP=0,
as indicated by other checks to see if USE_OPENMP equals 1. The
problem is obviously then that `ifdef USE_OPENMP` is very much not
what we want to test for. This causes tests to fail when no OpenMP
library is installed.
2015-06-12 23:52:07 -07:00
Zhang Xianyi
29293160a4 Fix #593. Change MACOSX_DEPLOYMENT_TARGET to 10.6. 2015-06-08 10:53:50 -05:00
wernsaar
3e33afef2e Merge pull request #592 from wernsaar/develop
added benchmark scripts
2015-06-08 14:22:02 +02:00
Werner Saar
8614057ea9 added benchmark scripts for numpy, octave and R 2015-06-08 14:06:38 +02:00
Werner Saar
7f375f9e8f updated geev benchmark 2015-06-08 12:58:38 +02:00
wernsaar
69c5169e7d Merge pull request #589 from wernsaar/develop
small modification of gemm.c
2015-06-03 12:14:09 +02:00
Werner Saar
e19948baa1 small modification of gemm.c 2015-06-03 09:11:51 +02:00
wernsaar
a2eaf234fc Merge pull request #587 from wernsaar/develop
added gesv benchmark
2015-06-02 15:29:49 +02:00
Werner Saar
6a13a94e71 added gesv benchmark 2015-06-02 13:35:49 +02:00
wernsaar
eff43d3289 Merge pull request #585 from wernsaar/develop
bugfix for benchmark Makefile on MAC
2015-05-31 15:01:54 +02:00
Werner Saar
9c4817d07b bugfix for Makefile on mac 2015-05-31 14:16:51 +02:00
wernsaar
319f3a0451 Merge pull request #584 from wernsaar/develop
bugfixes, to build benchmarks with mingw on Windows OS
2015-05-29 13:27:20 +02:00
Werner Saar
02c7766f68 bugfixes, to build benchmarks with mingw on Windows OS 2015-05-29 12:56:22 +02:00
wernsaar
f38cb67ca8 Merge pull request #581 from wernsaar/develop
bugfix for arm locking
2015-05-23 12:58:15 +02:00
Werner Saar
eea2e30b74 bugfix for arm locking 2015-05-23 11:40:40 +02:00
Werner Saar
19b8fd2aed smp lock bugfix 2015-05-23 10:58:38 +02:00
wernsaar
0cc5212741 Merge pull request #580 from wernsaar/develop
added blas level1 swap  benchmark
2015-05-23 09:46:39 +02:00
Werner Saar
c47c8e8cf5 added blas level1 swap benchmark 2015-05-21 08:51:42 +02:00
Zhang Xianyi
a11555c715 Support Android NDK armeabi-v7a-hard ABI. (-mfloat-abi=hard)
e.g.
make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7

In Android NDK, it uses armeabi-v7a-hard ABI.
TARGET_CFLAGS += -mhard-float -D_NDK_MATH_NO_SOFTFP=1
TARGET_LDFLAGS += -Wl,--no-warn-mismatch -lm_hard
For more information, please check hard-float example at
android_ndk/tests/device/hard-float/jni/.
2015-05-20 21:57:27 -05:00
wernsaar
897d03518e Merge pull request #578 from wernsaar/develop
added blas level1 copy benchmark
2015-05-20 11:56:02 +02:00
Werner Saar
23fbc5728e added blas level1 copy benchmark 2015-05-20 11:05:00 +02:00
Zhang Xianyi
6d40fa587f Fix f_check bug. 2015-05-19 12:04:45 -05:00
wernsaar
22dcd79959 Merge pull request #577 from wernsaar/develop
Bugfix for armv6 memory barrier
2015-05-19 10:59:24 +02:00
Werner Saar
ea4df0aad3 Ref #574: Bugfix for armv6 memory barrier 2015-05-19 10:43:12 +02:00
Zhang Xianyi
e127fb8fd8 1) Refs #575. Remove g77 from compiler list.
2) If OpenBLAS cannot find Fortran compiler, it will only build BLAS
(without LAPACK).
2015-05-19 00:01:04 -05:00
wernsaar
7fb718a7d8 Merge pull request #572 from wernsaar/develop
added optimized cscal and zscal functions for steamroller
2015-05-18 13:47:38 +02:00
Werner Saar
24f58c8bb1 added optimized cscal and zscal kernels for steamroller 2015-05-18 12:40:07 +02:00
Werner Saar
95b1faf667 added optimized cscal and zscal kernels for steamroller and piledriver 2015-05-18 10:50:57 +02:00
Werner Saar
2d9e406050 added optimized cscal kernel for sandybridge 2015-05-18 08:46:06 +02:00
Werner Saar
59083e3ce1 added optimized cscal kernel for bulldozer 2015-05-18 07:33:52 +02:00
wernsaar
685be40339 Merge pull request #571 from wernsaar/develop
added optimized cscal and zscal functions
2015-05-17 14:09:14 +02:00
Werner Saar
31c9e399e9 added optimized cscal kernel for haswell 2015-05-17 13:44:09 +02:00
Werner Saar
7de6bb9889 added optimized zscal kernel for bulldozer 2015-05-17 11:45:19 +02:00
Werner Saar
d63034303b added optimized zscal kernel for haswell 2015-05-16 16:41:45 +02:00
Zhang Xianyi
51ff17d46e Add AMD Excavator target. 2015-05-13 16:16:30 -05:00
wernsaar
905534942a Merge pull request #568 from wernsaar/develop
added optimized dscal kernel
2015-05-13 13:48:08 +02:00
Werner Saar
18e90ee2e3 bugfix: added static to functions 2015-05-13 13:31:26 +02:00
Werner Saar
e00cccc41e added optimized dscal kernel for piledriver 2015-05-13 13:05:35 +02:00
Werner Saar
73f09bf64f optimized dscal kernel for increment != 1 2015-05-13 12:14:39 +02:00
Werner Saar
02e772c7e4 added optimized dscal kernel for haswell 2015-05-12 17:19:58 +02:00
Werner Saar
7aee913991 added optimized dscal kernel for sandybridge 2015-05-12 16:27:43 +02:00
Werner Saar
e50a933037 added optimized dscal kernel for bulldozer 2015-05-12 12:28:44 +02:00
Zhang Xianyi
5f9011d6ef Merge pull request #566 from powderluv/develop
Fix build with ALLOC_SHM=0 (Android NDK)
2015-05-11 20:59:12 -05:00
powderluv
ebb9eba987 Fix build with ALLOC_SHM=0 (Android NDK)
Refactor such that you can build with ALLOC_SHM=0. HughTLB
implicity depends on ALLOC_SHM=1. This patch allows
building for Android NDK r10d.
2015-05-10 00:10:26 -07:00
Zhang Xianyi
8e5a1083bb Refs #532. Improve gemv paralel with small m and large n case.
Splite the matrix and reduction.
2015-05-08 05:33:17 +08:00
Zhang Xianyi
6743beb748 Refs #565. Fix the bug of generate FEXTRALIB. 2015-05-07 13:06:53 +08:00
Zhang Xianyi
bcabf72c08 Refs #565. Merge branch 'andreasnoack-anj/bench' into develop 2015-05-07 12:52:14 +08:00
Andreas Noack
cda29f183b Add vecLib benchmarks 2015-05-06 21:52:34 -04:00
wernsaar
e52d36450a Merge pull request #564 from wernsaar/develop
Use only 1 thread in trsm if m or n < 2*GEMM_MULTITHREAD_THRESHOLD
2015-05-06 11:10:31 +02:00
Werner Saar
f8f2e261fe use only 1 thread if m or n < 2*GEMM_MULTITHREAD_THRESHOLD 2015-05-06 10:41:53 +02:00
Werner Saar
be3c843700 added loops to trsm.c 2015-05-06 09:21:19 +02:00
wernsaar
e6f57db846 Merge pull request #563 from wernsaar/develop
Bugfix for gemm3m tests
2015-05-05 12:13:35 +02:00
Werner Saar
9bfd267d51 bugfix for gemm3m tests 2015-05-05 11:58:59 +02:00
Werner Saar
924bc5372e removed gemm3m functions from normal checks 2015-05-05 11:39:43 +02:00
wernsaar
2b83a69650 Merge pull request #561 from wernsaar/develop
updated dgemv_n sgemv_n kernels
2015-05-04 11:11:13 +02:00
Werner Saar
133c11a156 updated dgemv_n kernel for nehalem 2015-04-30 14:38:06 +02:00
Werner Saar
30f52d53df optimized dgemv_n kernel for haswell 2015-04-30 12:11:39 +02:00
Zhang Xianyi
a124637329 Merge pull request #560 from sebastien-villemot/develop
Fix detection of ARM architectures in c_check.
2015-04-29 11:36:47 -05:00
Sébastien Villemot
642aaba2e0 Fix detection of ARM architectures in c_check.
This is necessary to avoid the false detection of a cross-compiling environment.
2015-04-29 18:14:21 +02:00
wernsaar
4c616173e4 Merge pull request #558 from wernsaar/develop
optimizations for sandybridge
2015-04-28 17:30:16 +02:00
Werner Saar
5e83d80725 optimized dger kernel for sandybridge 2015-04-28 16:58:11 +02:00
Werner Saar
b2e1797dc6 added optimized sger kernel for sandybridge 2015-04-28 15:33:38 +02:00
Werner Saar
e216f686cb optimized saxpy and daxpy for sandybridge 2015-04-28 10:18:32 +02:00
Zhang Xianyi
e42652f772 Merge pull request #554 from wernsaar/develop
added benchmarks for zgeru and cgeru
2015-04-25 08:11:36 -05:00
Werner Saar
e77db2af31 add benchmarks for zgeru and cgeru 2015-04-25 14:53:07 +02:00
Zhang Xianyi
37b00841ac Merge pull request #552 from jeromerobert/develop
gemv: Ensure stack buffer is large enough to handle memory alignment
2015-04-24 14:12:12 -05:00
Werner Saar
fc0e0391f3 bugfixes: replaced int with BLASLONG 2015-04-24 14:30:44 +02:00
wernsaar
da0f27b9ac Merge pull request #553 from wernsaar/develop
optimized some blas level1 kernels for increments != 1
2015-04-24 13:57:48 +02:00
Werner Saar
c22068c406 optimized sdot.c for increments != 1 2015-04-24 13:13:20 +02:00
Werner Saar
dee100d0e4 optimized saxpy.c for increments != 1 2015-04-24 11:52:59 +02:00
Werner Saar
0273966abb optimized daxpy kernel for increments != 1 2015-04-24 11:39:17 +02:00
Werner Saar
3a67daa954 optimized ddot.c for increments != 1 2015-04-24 10:56:55 +02:00
Jerome Robert
ab567d8443 gemv: Ensure stack buffer is large enough to handle memory alignment
Ref #478
2015-04-24 10:12:49 +02:00
wernsaar
3c09cea4b2 Merge pull request #550 from wernsaar/develop
added optimized ssymv kernels for haswell and sandybridge
2015-04-23 13:27:38 +02:00
Werner Saar
b4f2153dcd added optimized ssymv kernels for sandybridge 2015-04-23 12:19:24 +02:00
Werner Saar
1c4b0eeae3 added optimized ssymv kernels for haswell 2015-04-23 10:23:13 +02:00
wernsaar
406d9d64e9 Merge pull request #549 from wernsaar/develop
added optimized dsymv kernels for haswell and sandybridge
2015-04-22 12:36:13 +02:00
Werner Saar
1bec9abb9a added optimized dsymv kernels for sandybridge 2015-04-22 12:09:43 +02:00
Werner Saar
3814bf60d3 added optimized dsymv kernels for haswell 2015-04-22 10:42:50 +02:00
Zhang Xianyi
847e19c04e Refs #478,#482, Enable stack alloc for s/dgemv_t.(revert 9798491) 2015-04-20 23:22:40 -05:00
Werner Saar
46c7b4d5c8 added asum benchmark 2015-04-19 11:24:07 +02:00
Werner Saar
8e05d291b5 added scal benchmark 2015-04-18 08:41:41 +02:00
wernsaar
9da555e5f7 Merge pull request #546 from wernsaar/develop
added optimized zaxpy-kernels
2015-04-16 11:36:51 +02:00
Werner Saar
6d0db0151f added optimized zaxpy-kernels 2015-04-16 11:19:37 +02:00
Zhang Xianyi
37b9033c90 Merge pull request #543 from jeromerobert/develop
Fix a buffer overflow with MAX_STACK_ALLOC size in dgemv_t
2015-04-15 11:18:14 -05:00
wernsaar
59e7a518c6 Merge pull request #544 from wernsaar/develop
Optimized  caxpy-kernels
2015-04-15 17:04:02 +02:00
Werner Saar
13889515b3 added optimized caxpy-kernel for sandybridge 2015-04-15 16:29:25 +02:00
Werner Saar
248c9340c3 added optimized caxpy-kernel for haswell 2015-04-15 15:16:31 +02:00
Werner Saar
e9f33b4ca7 added optimized caxpy-kernel for steamroller 2015-04-15 13:49:23 +02:00
Werner Saar
f5d847122a updated caxpy_microk_bulldozer-2.c and caxpy.c 2015-04-15 11:59:38 +02:00
Jerome Robert
a4c96eca67 Fix a buffer overflow with MAX_STACK_ALLOC size in dgemv_t
Refs #478, #482, 9798481, fd9fd42
2015-04-15 11:46:48 +02:00
wernsaar
fb02cb0a41 Merge pull request #540 from wernsaar/develop
Optimized dot- and axpy-kernels
2015-04-14 15:53:09 +02:00
Werner Saar
baa0363ea2 add optimized ddot-kernel for piledriver 2015-04-14 15:09:13 +02:00
Werner Saar
34ba66606a add optimized daxpy-kernel for piledriver 2015-04-14 14:23:29 +02:00
Werner Saar
f615dc7603 added optimized saxpy kernel for steamroller 2015-04-14 09:09:39 +02:00
Werner Saar
331c417637 optimized saxpy for piledriver 2015-04-14 08:34:11 +02:00
Zhang Xianyi
6c3a0b5d46 Enable MAX_STACK_ALLOC by default. 2015-04-13 23:23:40 -05:00
Zhang Xianyi
fd9fd42936 Refs #478, #482. Fixed bug on previous commit. 2015-04-13 23:22:27 -05:00
Zhang Xianyi
9798481979 Refs #478, #482. Fix segfault bug for gemv_t with MAX_ALLOC_STACK flag.
For gemv_t, directly use malloc to create the buffer.
2015-04-13 19:45:27 -05:00
Werner Saar
d7a17ad85d optimized sdot-kernel for pilediver 2015-04-13 13:19:21 +02:00
Werner Saar
d35f6c63c2 add optimized daxpy-kernel for steamroller 2015-04-13 12:22:43 +02:00
Werner Saar
166d76e864 added optimized sdot-kernel for steamroller 2015-04-11 08:48:18 +02:00
Werner Saar
f9f127d838 added optimized ddot kernel for steamroller 2015-04-10 16:18:03 +02:00
wernsaar
62231ab337 Merge pull request #538 from wernsaar/develop
Added optimized cdot- and zdot-kernels
2015-04-10 16:03:37 +02:00
Werner Saar
3119def9a7 updated cdot and zdot 2015-04-10 11:10:31 +02:00
Werner Saar
33b332372a add optimized cdot- and zdot-kernel for sandybridge 2015-04-10 09:37:26 +02:00
Werner Saar
fd838c75bc add optimized cdot- and zdot-kernel for haswell 2015-04-09 15:13:52 +02:00
Werner Saar
b57a60dac8 updated cdot and zdot for piledriver 2015-04-09 10:33:46 +02:00
Werner Saar
5c51163972 added optimized cdot- and zdot-kernel for steamroller 2015-04-09 09:45:23 +02:00
Werner Saar
9299d8cfd6 added optimized cdot- and zdot-kernels for bulldozer 2015-04-08 16:29:55 +02:00
Zhang Xianyi
0a3d3b945d Refs #535. Fix the wrong vector instruction in sgemm sandy bridge kernel. 2015-04-08 03:55:49 +08:00
Zhang Xianyi
4f680a7d61 Merge pull request #534 from wernsaar/develop
Refs #533. added optimized saxpy- and daxpy-kernel for haswell and sandybridge
2015-04-07 12:48:11 -05:00
Werner Saar
ba926e807c added cdot- and zdot benchmark 2015-04-07 11:56:06 +02:00
Werner Saar
60c6dec6e6 updated some lines for bulldozer 2015-04-06 18:47:16 +02:00
Werner Saar
47898cca35 added optimized saxpy- and daxpy-kernel for sandybridge 2015-04-06 16:05:16 +02:00
Werner Saar
53bb924287 added optimized saxpy- and daxpy-kernel for haswell 2015-04-06 12:33:16 +02:00
Zhang Xianyi
1e80b8b0d3 Merge pull request #531 from wernsaar/develop
added optimized sdot- and ddot-kernels for Haswell and Sandybridge
2015-04-05 16:42:39 -05:00
Werner Saar
a901b065d3 added optimized ddot-kernel for sandybridge 2015-04-05 20:19:38 +02:00
Werner Saar
3937e2a0a0 add optimized sdot-kernel for sandybridge 2015-04-05 19:47:05 +02:00
Werner Saar
9707d608d5 removed double definition line 2015-04-05 18:35:34 +02:00
Werner Saar
701b9d7556 added optimized sdot- and ddot-kernel for HASWELL 2015-04-05 17:57:53 +02:00
Zhang Xianyi
8977b3f235 Refs #529. Support Intel Broadwell by Haswell kernels. 2015-04-02 11:08:03 -05:00
Zhang Xianyi
f6426395ea Merge pull request #527 from xantares/patch-1
fix mingw install
2015-03-30 10:16:11 -05:00
xantares
0ac787eefe fix mingw install 2015-03-30 09:30:55 +02:00
Zhang Xianyi
e5b96e55a7 Fix build bug for ARM64. 2015-03-24 15:27:17 -05:00
Zhang Xianyi
d0c51c4de9 Merge branch 'develop' 2015-03-24 15:07:07 -05:00
Zhang Xianyi
a3491e1e88 Update the doc for 0.2.14. 2015-03-24 15:05:59 -05:00
Zhang Xianyi
e81a5d61e4 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2015-03-24 12:17:12 -05:00
Zhang Xianyi
c674fa32be Add ARM targets. 2015-03-24 12:17:04 -05:00
Zhang Xianyi
e34911a73d Fix compiling bug for ARM with setting BINARY. 2015-03-24 17:15:33 +00:00
Zhang Xianyi
76dcaf2281 Merge pull request #521 from maxlevesque/patch-1
Correct typo /proc/ instead of /pros/
2015-03-21 12:26:35 -05:00
Maximilien Levesque
770fac92eb Correct typo /proc/ instead of /pros/ 2015-03-20 23:25:11 +01:00
Zhang Xianyi
e95d64333a Refs #519. Avoid calling strncpy. 2015-03-19 15:57:22 -05:00
Zhang Xianyi
75c40bcc48 Refs #520. Fixed ONLY_CBLAS=1 compiling bug on OSX. 2015-03-19 11:52:09 -05:00
Zhang Xianyi
b62f9f4120 Merge pull request #518 from ton/issue-508
Fix issue #508
2015-03-18 13:00:07 -05:00
Ton van den Heuvel
b6438dedea Fix issue #508
Fix race condition during shutdown causing a crash in
gotoblas_set_affinity().
2015-03-18 13:22:43 +01:00
Hank Anderson
1d183dcda8 Added lapacke sources. 2015-02-25 16:51:08 -06:00
Zhang Xianyi
cdefdb21cd Refs #492. Fixed c/zsyr bug with negative incx. 2015-02-26 06:37:03 +08:00
Hank Anderson
e19bf3a28b Removed MSVC cpuid func when using clang. 2015-02-25 14:44:49 -06:00
Hank Anderson
3649cfbd7b Fixed EPILOGUE for clang. 2015-02-25 12:23:26 -06:00
Hank Anderson
5ae8993752 Added intrinsics for MSVC. 2015-02-25 11:52:51 -06:00
Hank Anderson
84d90d6ed8 Fixed some compiler errors/warnings for clang. 2015-02-25 11:52:25 -06:00
Hank Anderson
518e2424a8 Fixed bad filename for cpuid.S compile. 2015-02-25 11:51:29 -06:00
Zhang Xianyi
ea7f9dacf4 Refs #509. Fixed geadd building bug with DYNAMIC_ARCH=1. 2015-02-26 01:47:11 +08:00
Zhang Xianyi
bf5dbb7e2a Refs#509. Merge branch 'grisuthedragon-develop' into develop 2015-02-26 01:44:19 +08:00
Hank Anderson
00e373aea6 Added LAPACK sources directly to add_library call instead of OBJECT. 2015-02-25 10:18:18 -06:00
Hank Anderson
9eaea02f33 Added additional gemm defines for complex types. 2015-02-25 09:39:11 -06:00
Hank Anderson
ab7043373f Fixed bug generating trmv complex source names. 2015-02-24 15:18:41 -06:00
Hank Anderson
504cdb10ed Added check for MSVC before enabling fortran.
Currently forcing gfortran, instead of assuming ifort.
2015-02-24 14:31:45 -06:00
Hank Anderson
a8002b0c5f Separated getarch ASM file when using MSVC. 2015-02-24 14:31:18 -06:00
Hank Anderson
0553476fba Added TRANS defines for complex sources in lapack. 2015-02-24 14:30:35 -06:00
Hank Anderson
2416d9dbac Fixed TRANSA defines for complex sources in driver/level2. 2015-02-24 13:18:07 -06:00
Hank Anderson
0d8e227ea7 Changed strategy for setting preprocessor definitions.
Instead of generating separate object files for each permutation of
defines for a source file, GenerateNamedObjects now writes an entirely
new source file and inserts the defines as #define c statements.

This solves a problem I ran into with ar.exe where it was refusing to
link objects that had the same filename despite having different paths.
2015-02-24 12:26:33 -06:00
Hank Anderson
12d1fb2e40 Fixed incorrect object name in kernel CMakeLists.txt 2015-02-24 10:30:16 -06:00
Hank Anderson
1b7f427401 Added conj gemv objects for complex build. 2015-02-23 10:24:31 -06:00
Hank Anderson
b2284647a3 More complex objects. 2015-02-23 07:51:05 -06:00
Hank Anderson
a6116e5859 Added some more complex-only objects. 2015-02-22 17:49:28 -06:00
Hank Anderson
fb5d5bb971 Added defines for complex trmv. 2015-02-21 12:39:03 -06:00
Hank Anderson
371071d461 Added CONJ defines for trmm/trsm. 2015-02-21 10:59:02 -06:00
Hank Anderson
8a143516e3 Added alternate_name to a couple of the name mangling schemes.
Added zherk_k sources to driver/level3.
2015-02-20 17:03:33 -06:00
Hank Anderson
e5897ecb9b Added zherk_kernel.c objects to driver/level3. 2015-02-19 16:19:56 -06:00
Hank Anderson
714638c187 Added some TRMM objects for complex types. 2015-02-19 16:11:51 -06:00
Hank Anderson
e27c372e53 Fixed reuse of float_char from parent loop.
Fixed in/it/on/otcopy names.
2015-02-19 13:53:29 -06:00
Hank Anderson
f3f2b3d768 Added complex and single netlib-lapack fortran sources to lapack.cmake. 2015-02-19 12:26:11 -06:00
Hank Anderson
9492298048 Added other float types to Makefile.L3. 2015-02-18 13:01:05 -06:00
Hank Anderson
43725b82c5 ParseMakefileVars now replaces Makefile vars with CMake vars. 2015-02-18 12:23:17 -06:00
Hank Anderson
14fd3d35de Added checks for missing defines in kernel. 2015-02-18 10:25:01 -06:00
Hank Anderson
cebc07cebd ParseMakefileVars now recursively parses included makefiles. 2015-02-17 22:09:41 -06:00
Hank Anderson
33c5e8db7f Added a helper function for setting the L1 kernel defaults.
Added loop to build objects with different KERNEL defines.
2015-02-17 21:36:23 -06:00
Hank Anderson
67e39bd8fb Added mangled complex filenames to interface and lapack CMakeLists.txt. 2015-02-17 13:12:30 -06:00
Hank Anderson
9eb1499095 Added another param to GenerateNamedObjects to mangle complex source names.
There are a lot of sources for complex float types that are the same
names as the real sources, except with z prepended.
2015-02-17 10:30:28 -06:00
Martin Koehler
39cc6b21d3 Add ATLAS-style ?geadd function 2015-02-16 13:46:20 +01:00
Hank Anderson
4662a0b13a Changed generate functions to iterate through a list of float types.
This will generate obj files for SINGLE/DOUBLE/COMPLEX/DOUBLE COMPLEX.
2015-02-15 17:44:37 -06:00
Hank Anderson
e74462a3f5 Moved declarations to start of functions to satisfy MSVC C89 implementation. 2015-02-11 11:16:57 -06:00
Hank Anderson
056ba26755 Changed a number of inline calls to use __inline.
MSVC doesn't inmplement C99, so can't use the inline keyword. __inline
appears to work in MSVC and GCC.
2015-02-11 11:13:17 -06:00
Hank Anderson
a0d9a7fd83 Changed _Complex types in common_level1.h to use the typedef. 2015-02-11 11:12:14 -06:00
Hank Anderson
5d3fc092e9 Added MSVC defines to common.h.
Don't have unistd.h in MSVC.

Chagned YIELDING to use the YeildProcessor macro.
2015-02-11 11:10:45 -06:00
Hank Anderson
c94fe71278 Removed incoming-stack-boundary for MSVC.
Made float type optional for GenerateNamedObjects.

Called GenerateNamedObjects for a couple of driver/others files that
needed NAME/CNAME set.
2015-02-11 10:54:14 -06:00
Hank Anderson
d60b49e5c5 Turned off uninizialized variable warning when compiling lapack-netlib. 2015-02-10 14:36:43 -06:00
Hank Anderson
64b5a0ef84 Added AUX files from lapack-netlib. 2015-02-10 14:29:05 -06:00
Hank Anderson
162791e30e Added common objects from kernel Makefile. 2015-02-10 12:42:05 -06:00
Hank Anderson
8743093bd7 Added aux files from lapack-netlib. 2015-02-10 11:47:46 -06:00
Hank Anderson
96cf6779ca Added DLA sources from lapack-netlib.
Can't use the lapack-netlib cmake files, since they are designed to
build a complete lapack/blas library. They have their own fortran
detection and flag setup and so on. Instead I'll just recreate the
makefiles I need.

Fixed a typo in the NAME defines.
2015-02-10 11:01:01 -06:00
Hank Anderson
3b20b62423 Fixed trti2 name. 2015-02-09 15:29:28 -06:00
Hank Anderson
6ddbfea700 Added generic laswp object. 2015-02-09 15:15:58 -06:00
Hank Anderson
c0624a26be Fixed some dgemm_copy function names. 2015-02-09 14:34:29 -06:00
Hank Anderson
4bfaf1ce66 Removed some list appends I missed. 2015-02-09 12:56:55 -06:00
Hank Anderson
e8c39138c6 Removed return value from GenerateNamedObjects.
It sets DBLAS_OBJS directly to save a bunch of list appending in the
CMakeLists.txt files.
2015-02-09 12:28:09 -06:00
Hank Anderson
f992799226 Added the rest of Makefile.L3. 2015-02-09 10:47:35 -06:00
Hank Anderson
4c65afcce1 Changed kernel filenames to vars. These will need to be read from KERNEL.
Added some kernel/L3 objects.
2015-02-09 09:52:14 -06:00
Hank Anderson
7fa5c4e2fd Fixed some case issues with ARCH.
Added some kernel and driver/others objects.
2015-02-08 15:29:18 -06:00
Zhang Xianyi
771b18ae9c Detect the wrong combined flags of USE_OPENMP=1 and USE_THREAD=0. 2015-02-08 01:42:48 -06:00
Zhang Xianyi
cfa9392ffa Fix openblas_get_num_threads and openblas_get_num_procs bug with single thread. 2015-02-08 01:30:23 -06:00
Hank Anderson
fa0e6a6c93 Added the rest of the L1 kernel makefile. 2015-02-07 21:37:46 -06:00
Hank Anderson
2f59135eb6 Added gemv to level2 CMakeLists.txt. 2015-02-07 21:15:21 -06:00
Hank Anderson
38681fb1c6 Added more kernel files. 2015-02-07 12:54:30 -06:00
Hank Anderson
6b5d26e07b Added SMP sources to level2 CMakeLists.txt. 2015-02-06 16:52:19 -06:00
Hank Anderson
13d2d48e67 Added yet another naming scheme for lapack functions. 2015-02-06 13:42:20 -06:00
Hank Anderson
189fadfde0 Started implementing kernel/Makefile in cmake. 2015-02-05 21:05:11 -06:00
Hank Anderson
627d5e7401 Added SMP objects to driver/level3. 2015-02-05 12:22:48 -06:00
Hank Anderson
943fa2fb58 Fixed object names in level2. 2015-02-05 10:49:11 -06:00
Hank Anderson
1b62a4f3c9 Changed some function parameters to optional. 2015-02-05 09:39:40 -06:00
Hank Anderson
461e691127 Codes when define is absent are now a parameter to AllCombinations.
The level3 object names should now be correct.
2015-02-05 09:23:47 -06:00
Hank Anderson
cfaf1c678f Added option to append define codes with an underscore.
Fixed the code array not getting reset on subsequent AllCombinations
calls.
2015-02-05 09:17:18 -06:00
Hank Anderson
0d7bad1f35 Changed GenerateObjects to append combination codes (e.g. dtrmm_TU). 2015-02-05 09:02:54 -06:00
Hank Anderson
373a1bdadb Converted lapack/Makefile to cmake. 2015-02-04 15:47:10 -06:00
Hank Anderson
2828f6630c Added SMP sources to COMMONOBJS. 2015-02-04 14:01:36 -06:00
Hank Anderson
58cff2fed8 Added CBLAS define/naming convention to GenerateNamedObjects. 2015-02-04 11:30:15 -06:00
Hank Anderson
5690cf3f0e Added override for function names in GenerateNamedObjects.
The BLAS interface folder should now be generated the correct objects
for the DOUBLE case.
2015-02-04 10:52:19 -06:00
Hank Anderson
a0aeda6187 Added function to set defines for the object names (e.g. -DNAME=dgemm). 2015-02-04 10:37:34 -06:00
Zhang Xianyi
1ccd57ce80 Merge pull request #497 from eschnett/develop
Introduce openblas_get_num_threads and openblas_get_num_procs
2015-02-03 23:09:38 -06:00
Hank Anderson
84b3d760c4 Converted rest of Makefile.system to system.cmake. 2015-02-03 16:05:01 -06:00
Hank Anderson
0beea3a5a5 Converted LAPACK flags from Makefile.system. 2015-02-03 15:33:56 -06:00
Hank Anderson
560c96a9a7 Fixed newlines in some cmake files. 2015-02-03 15:11:15 -06:00
Hank Anderson
0ccfa60a53 Changed fortran compiler name to be uppercase and stripped of path/ext. 2015-02-03 15:09:37 -06:00
Hank Anderson
30be551502 Corrected fortran compiler name variables.
Fixed some typos.

Updated c_check to set ARCH and BINARY64/32.

Added version variables.
2015-02-03 14:21:22 -06:00
Hank Anderson
be1ce38f24 Fixed some missing parentheses. 2015-02-03 14:00:29 -06:00
Hank Anderson
e818ace11a Ported more of Makefile.system to CMake. 2015-02-03 13:34:41 -06:00
Hank Anderson
e4bfbd8258 Added fc.cmake (forgot it in last commit).
Moved a couple C compiler ifs from Makefile.system into cc.cmake.
2015-02-03 13:08:59 -06:00
Hank Anderson
2d5b442f5b Ported Fortran configuration code from Makefile.system to fc.cmake. 2015-02-03 12:32:23 -06:00
Hank Anderson
af11aff309 Ported C compiler settings from Makefile.system into new cmake file. 2015-02-03 12:00:49 -06:00
Hank Anderson
e66aa5f3b7 Ported arch dependent settings from Makefile.system to new cmake file. 2015-02-03 11:32:20 -06:00
Erik Schnetter
65a847cd36 Introduce openblas_get_num_threads and openblas_get_num_procs 2015-02-03 12:23:41 -05:00
Hank Anderson
31cf22cb4b Ported OS settings from Makefile.system into new cmake file. 2015-02-03 11:07:58 -06:00
Hank Anderson
20e593a44a Added cblas_ objects to interface CMakeLists.
Naming isn't right, though, not seeing cblas_xxxx exports in the
resulting library.
2015-02-02 16:25:30 -06:00
Hank Anderson
7194424fef Added missing common objects to the library. 2015-02-02 15:21:29 -06:00
Hank Anderson
d11bde60d0 DOUBLE define for DBLAS objects is now set in main CMakeLists.txt.
Since the objects are the same, could generate SINGLE/COMPLEX/etc here
without having to rewrite all the object enumeration code again.
2015-02-02 15:00:44 -06:00
Hank Anderson
9e154aba58 Added LAPACK object files to interface CMakeLists. 2015-02-02 12:31:15 -06:00
Hank Anderson
5057a4b4df Added openblas add_library call that uses DBLAS_OBJS ojbects. 2015-01-30 15:21:21 -06:00
Hank Anderson
3e8ea7a351 Added COMMONOBJS to driver/others CMakeLists.txt. 2015-01-30 14:06:14 -06:00
Hank Anderson
d3dcdddf75 Moved functions into util cmake file. 2015-01-30 13:47:40 -06:00
Hank Anderson
e5e7595bf9 Added paramater to GenerateObjects for defines that affect all sources. 2015-01-30 13:31:13 -06:00
Hank Anderson
7693887d61 Added empty set to the combinations generated by AllCombinations. 2015-01-30 13:01:11 -06:00
Hank Anderson
8d9b196e0d Moved loop over define combos into a function.
This function takes a set of sources and a set of preprocessor
definitions. It will iterate over the sources and build an object
file for each combination of preprocessor definitions for each
source file.
2015-01-30 12:14:44 -06:00
Hank Anderson
a6cf8aafc0 Updated level3/CMakeLists with correct defines using all combos. 2015-01-30 11:21:50 -06:00
Hank Anderson
dbdca7bf0c Added first pass at driver/level3 Makefile conversion.
Added a rather convoluted CMake function to find all combinations
of a given list. This will be useful for the object files that are
compiled multiple times with different combinations of preprocessor
definitions.
2015-01-29 22:53:11 -06:00
Hank Anderson
dabaecb2bc Moved getarch parsing code into a function. 2015-01-29 09:30:47 -06:00
Zhang Xianyi
07ff001981 Merge pull request #495 from jeromerobert/develop
Fix a segfault in gemv when MAX_STACK_ALLOC is set
2015-01-29 18:23:50 +08:00
Jerome Robert
b17ccb4c5c Fix a segfault in gemv when MAX_STACK_ALLOC is set
* stack_alloc_size is needed after the implementation call
but it may be overwritten if it's optimized to a register,
because some gemv implementation (ex: dgemv_n.S) do not
restore all register (ex: r10).
* do the same in ger.c for the same reasons even if the bug
has not been observed.
2015-01-29 09:55:57 +01:00
Hank Anderson
8c23965da3 prebuild.cmake now reads the output from getarch into CMake vars. 2015-01-28 22:57:44 -06:00
Hank Anderson
61f21b5d03 getarch_2nd now appends its output to config.h/config_kernel.h 2015-01-28 22:20:15 -06:00
Hank Anderson
8ede4a8da4 getarch now compiles and sets config.h defines properly.
Still isn't parsed into CMake variables, and getarch_2 needs to
get the same treatment.
2015-01-28 17:18:26 -06:00
Hank Anderson
1c5b6bb4f7 Added CORE define to config.h in prebuild.cmake (temporarily). 2015-01-28 16:33:48 -06:00
Hank Anderson
c5f5c7a076 Updated c_check OS/compiler/bits detection. 2015-01-28 15:47:47 -06:00
Hank Anderson
9a508abdc7 Added first pass at driver/level2 makefile conversion. 2015-01-28 14:52:15 -06:00
Hank Anderson
5eefe18ae4 Added CMakeLists.txt for the first of the BLAS folders.
It only does the double precision compile currently.

I realized I didn't finish converting Makefile.system yet, so I made
a note of that.
2015-01-27 16:17:17 -06:00
Hank Anderson
1e8bb0e0e0 Fixed architecture detection when AMD64 in c_check. 2015-01-27 14:03:46 -06:00
Hank Anderson
864b8b31de Fixed incorrect case in OS_ definition in c_check. 2015-01-27 13:54:29 -06:00
Hank Anderson
d2d15e522f Started converting lib target to CMake.
The main part of this target is looping through the BLAS subfolders
and calling make on them. Need to add CMakeLists.txt for each of these
subfolders.
2015-01-27 12:23:35 -06:00
Hank Anderson
f4d1e7a265 Hardcoded NUM_CORES to get system.cmake working. 2015-01-27 11:37:39 -06:00
Zhang Xianyi
63c6fcfa0a Merge pull request #490 from eschnett/develop
Move #include statements outside extern "C" blocks
2015-01-13 15:43:56 +08:00
Erik Schnetter
29cb47fc06 Move #include statements outside extern "C" blocks 2015-01-12 21:27:52 -05:00
Zhang Xianyi
4e6c4046f7 Fix cortex-a15 detecting bug. 2015-01-12 09:35:16 +00:00
Zhang Xianyi
229ce2ccd1 Add cortex-a9 and cortex-a15 targets. 2015-01-12 08:55:29 +00:00
Zhang Xianyi
ef75be0e51 Merge pull request #487 from kortschak/dromtg-test
Add test for drotmg bug fixed by 692b14c
2015-01-07 14:13:11 +08:00
kortschak
5344f335a8 Add test for drotmg bug fixed by 692b14c
Test requested in issue xianyi/OpenBLAS#484.

Run tests by applying the following change and then make:

	diff --git a/Makefile.rule b/Makefile.rule
	index bea1fe1..9852ff3 100644
	--- a/Makefile.rule
	+++ b/Makefile.rule
	@@ -140,7 +140,7 @@ NO_AFFINITY = 1

	-# UTEST_CHECK = 1
	+UTEST_CHECK = 1
2015-01-07 10:06:55 +10:30
Hank Anderson
0f6bec0a32 cmake.prebuild now compiles getarch.
Doesn't actually run it yet.
2015-01-01 21:03:17 -06:00
Hank Anderson
92cdac5f87 Added MSVC functions to cpuid_x86.c to replace gcc-specific ASM. 2015-01-01 21:02:48 -06:00
Hank Anderson
1a41022e3e Added MSVC defines to cpuid.h and getarch.c. 2015-01-01 21:01:28 -06:00
Zhang Xianyi
5cb5af9333 Add configuration options. 2015-01-02 02:42:32 +08:00
Zhang Xianyi
41aad0407f Merge pull request #482 from jeromerobert/develop
Allow to do gemv and ger buffer allocation on the stack
2015-01-02 02:26:17 +08:00
Hank Anderson
e5c47e44f6 First pass at converting a few makefiles to CMake. 2014-12-30 21:53:00 -06:00
Zhang Xianyi
f8f2e84659 Merge pull request #486 from wernsaar/develop
Optimizations for steamroller
2014-12-31 02:36:23 +08:00
Werner Saar
34633fef01 Merge branch 'develop' of github.com:wernsaar/OpenBLAS into develop 2014-12-30 20:16:53 +08:00
Werner Saar
ddf983d643 added optimizations for steamroller 2014-12-30 20:14:45 +08:00
Zhang Xianyi
17b9db20f1 Merge pull request #483 from wernsaar/develop
added Steamroller as a  cpu target
2014-12-29 12:00:16 +08:00
Werner Saar
0dc559ed30 bugfix in dynamic.c 2014-12-28 17:15:42 +01:00
Werner Saar
9566f5fdb0 added Steamroller as a target processor 2014-12-28 13:45:19 +01:00
Werner Saar
4319769b79 added target processor STEAMROLLER 2014-12-28 20:16:46 +08:00
Jerome Robert
e9d9a8eae3 Allow to do gemv and ger buffer allocation on the stack
ger and gemv call blas_memory_alloc/free which in their turn
call blas_lock. blas_lock create thread contention when matrices
are small and the number of thread is high enough. We avoid
call blas_memory_alloc by replacing it with stack allocation.
This can be enabled with:
make -DMAX_STACK_ALLOC=2048
The given size (in byte) must be high enough to avoid thread contention
and small enough to avoid stack overflow.

Fix #478
2014-12-27 14:33:12 +01:00
Zhang Xianyi
cbb3ab80e7 Merge pull request #481 from eschnett/develop
Correct ilaver C declaration
2014-12-26 10:09:19 +08:00
Erik Schnetter
cd9868b1b4 Correct ilaver C declaration 2014-12-25 17:41:17 -05:00
Zhang Xianyi
eb738148fe Merge pull request #479 from wernsaar/develop
workaround for sandybridge zgemm kernel
2014-12-23 00:59:41 +08:00
Werner Saar
587e16fba3 Ref #458: Backport, sandybrigde uses nehalem zgemm kernel 2014-12-22 17:01:18 +01:00
Werner Saar
4de7b9ae47 increased NMAX to 128 2014-12-22 14:04:27 +01:00
Werner Saar
887aed634d modified sources for OS Darwin 2014-12-19 12:40:46 +01:00
Werner Saar
6261342de3 small optimization on dgemm_kernel for N=1 2014-12-18 20:35:51 +01:00
Werner Saar
1e566223ed added code for the size of n 2014-12-17 15:02:11 +01:00
Werner Saar
113b48ca22 modified makefile for acml6.1 2014-12-17 14:12:21 +01:00
Zhang Xianyi
3e81c99b6b Fixed installation bug on Mac OSX. 2014-12-13 13:05:06 +08:00
Werner Saar
ec85c4a51d Increased the Threshold value in sep.in 2014-12-11 14:57:41 +01:00
Werner Saar
97de657d38 added tests to sep.as as workaround for gfortran-4.8.x 2014-12-11 13:53:59 +01:00
Zhang Xianyi
71966eba6c Merge pull request #475 from xantares/patch-2
add OpenBLAS_VERSION to cmake config file
2014-12-09 17:57:43 +08:00
Zhang Xianyi
a359979e17 Merge pull request #474 from xantares/patch-1
set OPENBLAS_CMAKE_DIR to <prefix>/lib/cmake/<package_name>
2014-12-09 17:57:16 +08:00
xantares
7a6a141bc4 add OpenBLAS_VERSION to cmake config file 2014-12-09 10:34:41 +01:00
xantares
b8ff6892f6 set OPENBLAS_CMAKE_DIR to <prefix>/lib/cmake/<package_name>
usually these files are more often located in this subdir
2014-12-09 10:18:18 +01:00
Zhang Xianyi
8fe7a9ce6f Merge pull request #473 from wernsaar/develop
changed inline assembler labels to short form
2014-12-08 13:22:18 +08:00
Werner Saar
bc5fff7085 changed inline assembler labels to short form 2014-12-07 12:38:54 +01:00
Zhang Xianyi
51ce5ef447 Merge branch 'develop' 2014-12-03 23:14:21 +08:00
Zhang Xianyi
1943ea91a8 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2014-12-03 23:03:48 +08:00
Zhang Xianyi
37aee1f9b1 Merge branch 'develop' 2014-12-03 23:01:33 +08:00
Zhang Xianyi
f5424fc9de Update the doc for 0.2.13 version. 2014-12-03 23:00:29 +08:00
Zhang Xianyi
0cf29ba6d2 Fixed a bug of sgemm sandy bridge kernel.
Reported by Julia project. JuliaLang/julia#9084
2014-12-03 17:38:41 +08:00
Zhang Xianyi
50e18033e6 Merge pull request #471 from nolta/patch-4
c_check: set $hostarch to x86_64 instead of amd64
2014-12-03 12:53:20 +08:00
Zhang Xianyi
551b55d1c7 Merge pull request #470 from nolta/patch-3
fix fortran compiler detection on FreeBSD
2014-12-03 12:50:46 +08:00
Mike Nolta
271ceb8bae c_check: set $hostarch to x86_64 instead of amd64
`uname -m` returns "amd64" on some systems.
2014-12-02 21:23:23 -05:00
Mike Nolta
5f846be2e4 fix fortran compiler detection on FreeBSD
On FreeBSD, passing extra options to `which` causes it to report a non-zero status:

```
$ which gfortran48 -m64
/usr/local/bin/gfortran48
$ echo $?
1
```

```
$ which gfortran48
/usr/local/bin/gfortran48
$ echo $?
0
```
2014-12-02 20:47:40 -05:00
Zhang Xianyi
fe7dcf98f3 Refs #461. Provide OpenBLASConfig.cmake to support CMake.
If you "make PREFIX=/path/to/OpenBLAS install" ,
The config file will be located in /path/to/OpenBLAS/cmake

Then, you can use "find_package(OpenBLAS)" at CMake.
cmake -DOpenBLAS_DIR=/path/to/OpenBLAS/cmake ..
2014-11-29 02:16:40 +08:00
Zhang Xianyi
2fb02626da Update organization info. 2014-11-25 15:28:58 +08:00
Zhang Xianyi
a85c2785ae Refs #467. Added generic kernel file for x86_64. 2014-11-24 15:34:48 +08:00
Zhang Xianyi
4806715c97 Fixed #456. Merged the optimizations for APM's
xgene-1 (aarch64).
Merge branch 'benedikt-huber-dave-patch' into develop
2014-11-11 22:21:04 +08:00
Benedikt Huber
58c90d5937 # The first commit's message is:
Optimizations for APM's xgene-1 (aarch64).

1) general system updates to support armv8 better.  Make all did not work, one needed to supply TARGET=ARMV8.
2) sgem 4x4 kernel in assembler using SIMD, and configuration changes to use it.
3) strmm 4x4 kernel in C.  Since the sgem kernel does 4x4, the trmm kernel must also do 4xN.

Added Dave Nuechterlein to the contributors list.
2014-11-11 22:19:23 +08:00
Zhang Xianyi
2987bc7b40 refs #464. Fixed the bug of detecting L2 associative on x86. 2014-11-10 17:15:34 +08:00
Zhang Xianyi
695e0fa649 #463 fixed a compiling bug on AIX. 2014-11-10 14:39:56 +08:00
Zhang Xianyi
cbb23c46c2 Merge pull request #459 from tkelman/symbol-rename
add SYMBOLPREFIX and SYMBOLSUFFIX makefile options
2014-10-25 19:49:03 +08:00
Tony Kelman
0b4602b753 add SYMBOLPREFIX and SYMBOLSUFFIX makefile options
for adding a prefix or suffix to all exported symbol names in the shared library
Useful to avoid conflicts with other BLAS libraries, especially when using
64 bit integer interfaces in OpenBLAS

Note that since OSX does not have the objcopy utility, setting these options
to non-empty values on Mac requires the objconv tool, available (GPL license)
from http://www.agner.org/optimize/#objconv
2014-10-24 22:27:09 -07:00
3665 changed files with 196458 additions and 32342 deletions

4
.gitignore vendored
View File

@@ -15,6 +15,7 @@ lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/TESTING/testing_results.txt
*.so
*.so.*
*.a
.svn
*~
@@ -65,3 +66,6 @@ test/sblat3
test/zblat1
test/zblat2
test/zblat3
build
build.*
*.swp

View File

@@ -1,4 +1,13 @@
language: c
notifications:
webhooks:
urls:
- https://webhooks.gitter.im/e/8a6e4470a0cebd090344
on_success: change # options: [always|never|change] default: always
on_failure: always # options: [always|never|change] default: always
on_start: never # options: [always|never|change] default: always
compiler:
- gcc

190
CMakeLists.txt Normal file
View File

@@ -0,0 +1,190 @@
##
## Author: Hank Anderson <hank@statease.com>
##
cmake_minimum_required(VERSION 2.8.4)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
set(OpenBLAS_PATCH_VERSION 16.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
enable_language(C)
if(MSVC)
set(OpenBLAS_LIBNAME libopenblas)
else()
set(OpenBLAS_LIBNAME openblas)
endif()
#######
if(MSVC)
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
option(BUILD_DEBUG "Build Debug Version" OFF)
#######
if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif()
if(BUILD_DEBUG)
set(CMAKE_BUILD_TYPE Debug)
else()
set(CMAKE_BUILD_TYPE Release)
endif()
if(BUILD_WITHOUT_CBLAS)
set(NO_CBLAS 1)
endif()
#######
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake")
include("${CMAKE_SOURCE_DIR}/cmake/system.cmake")
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
if (NOT DYNAMIC_ARCH)
list(APPEND BLASDIRS kernel)
endif ()
if (DEFINED UTEST_CHECK)
set(SANITY_CHECK 1)
endif ()
if (DEFINED SANITY_CHECK)
list(APPEND BLASDIRS reference)
endif ()
set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
endif ()
# set which float types we want to build for
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
# if none are defined, build for all
set(BUILD_SINGLE true)
set(BUILD_DOUBLE true)
set(BUILD_COMPLEX true)
set(BUILD_COMPLEX16 true)
endif ()
set(FLOAT_TYPES "")
if (BUILD_SINGLE)
message(STATUS "Building Single Precision")
list(APPEND FLOAT_TYPES "SINGLE") # defines nothing
endif ()
if (BUILD_DOUBLE)
message(STATUS "Building Double Precision")
list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE
endif ()
if (BUILD_COMPLEX)
message(STATUS "Building Complex Precision")
list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX
endif ()
if (BUILD_COMPLEX16)
message(STATUS "Building Double Complex Precision")
list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
endif ()
set(SUBDIRS_ALL ${SUBDIRS} test ctest utest exports benchmark ../laswp ../bench)
# all :: libs netlib tests shared
# libs :
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.")
endif ()
if (${NO_STATIC} AND ${NO_SHARED})
message(FATAL_ERROR "Neither static nor shared are enabled.")
endif ()
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "")
foreach (SUBDIR ${SUBDIRS})
add_subdirectory(${SUBDIR})
string(REPLACE "/" "_" subdir_obj ${SUBDIR})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:${subdir_obj}>")
endforeach ()
# netlib:
# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
if (NOT NOFORTRAN AND NOT NO_LAPACK)
include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake")
if (NOT NO_LAPACKE)
include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake")
endif ()
endif ()
#Only generate .def for dll on MSVC
if(MSVC)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
if(NOT MSVC)
#only build shared library for MSVC
add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
if(SMP)
target_link_libraries(${OpenBLAS_LIBNAME} pthread)
target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
endif()
#build test and ctest
enable_testing()
add_subdirectory(test)
if(NOT NO_CBLAS)
add_subdirectory(ctest)
endif()
endif()
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)
# TODO: Why is the config saved here? Is this necessary with CMake?
#Save the config files for installation
# @cp Makefile.conf Makefile.conf_last
# @cp config.h config_last.h
#ifdef QUAD_PRECISION
# @echo "#define QUAD_PRECISION">> config_last.h
#endif
#ifeq ($(EXPRECISION), 1)
# @echo "#define EXPRECISION">> config_last.h
#endif
###
#ifeq ($(DYNAMIC_ARCH), 1)
# @$(MAKE) -C kernel commonlibs || exit 1
# @for d in $(DYNAMIC_CORE) ; \
# do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
# done
# @echo DYNAMIC_ARCH=1 >> Makefile.conf_last
#endif
#ifdef USE_THREAD
# @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
#endif
# @touch lib.grd

View File

@@ -117,5 +117,18 @@ In chronological order:
* Isaac Dunham <https://github.com/idunham>
* [2014-08-03] Fixed link error on Linux/musl
* Dave Nuechterlein
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
ARMv8 support.
* Dan Kortschak
* [2015-01-07] Added test for drotmg bug #484.
* Ton van den Heuvel <https://github.com/ton>
* [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity().
* Martin Koehler <https://github.com/grisuthedragon/>
* [2015-09-07] Improved imatcopy
* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]

View File

@@ -1,4 +1,98 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.15
27-Oct-2015
common:
* Support cmake on x86/x86-64. Natively compiling on MS Visual Studio.
(experimental. Thank Hank Anderson for the initial cmake porting work.)
On Linux and Mac OSX, OpenBLAS cmake supports assembly kernels.
e.g. cmake .
make
make test (Optional)
On Windows MS Visual Studio, OpenBLAS cmake only support C kernels.
(OpenBLAS uses AT&T style assembly, which is not supported by MSVC.)
e.g. cmake -G "Visual Studio 12 Win64" .
Open OpenBLAS.sln and build.
* Enable MAX_STACK_ALLOC flags by default.
Improve ger and gemv for small matrices.
* Improve gemv parallel with small m and large n case.
* Improve ?imatcopy when lda==ldb (#633. Thanks, Martin Koehler)
* Add vecLib benchmarks (#565. Thanks, Andreas Noack.)
* Fix LAPACK lantr for row major matrices (#634. Thanks, Dan Kortschak)
* Fix LAPACKE lansy (#640. Thanks, Dan Kortschak)
* Import bug fixes for LAPACKE s/dormlq, c/zunmlq
* Raise the signal when pthread_create fails (#668. Thanks, James K. Lowden)
* Remove g77 from compiler list.
* Enable AppVeyor Windows CI.
x86/x86-64:
* Support pure C generic kernels for x86/x86-64.
* Support Intel Boardwell and Skylake by Haswell kernels.
* Support AMD Excavator by Steamroller kernels.
* Optimize s/d/c/zdot for Intel SandyBridge and Haswell.
* Optimize s/d/c/zdot for AMD Piledriver and Steamroller.
* Optimize s/d/c/zapxy for Intel SandyBridge and Haswell.
* Optimize s/d/c/zapxy for AMD Piledriver and Steamroller.
* Optimize d/c/zscal for Intel Haswell, dscal for Intel SandyBridge.
* Optimize d/c/zscal for AMD Bulldozer, Piledriver and Steamroller.
* Optimize s/dger for Intel SandyBridge.
* Optimize s/dsymv for Intel SandyBridge.
* Optimize ssymv for Intel Haswell.
* Optimize dgemv for Intel Nehalem and Haswell.
* Optimize dtrmm for Intel Haswell.
ARM:
* Support Android NDK armeabi-v7a-hard ABI (-mfloat-abi=hard)
e.g. make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7
* Fix lock, rpcc bugs (#616, #617. Thanks, Grazvydas Ignotas)
POWER:
* Support ppc64le platform (ELF ABI v2. #612. Thanks, Matthew Brandyberry.)
* Support POWER7/8 by POWER6 kernels. (#612. Thanks, Fábio Perez.)
====================================================================
Version 0.2.14
24-Mar-2015
common:
* Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.)
* Improve ger and gemv for small matrices by stack allocation.
e.g. make -DMAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.)
* Introduce openblas_get_num_threads and openblas_get_num_procs.
(#497. Thanks, Erik Schnetter.)
* Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.)
* Fix c/zsyr bug with negative incx. (#492.)
* Fix race condition during shutdown causing a crash in
gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.)
x86/x86-64:
* Support AMD Streamroller.
ARM:
* Add Cortex-A9 and Cortex-A15 targets.
====================================================================
Version 0.2.13
3-Dec-2014
common:
* Add SYMBOLPREFIX and SYMBOLSUFFIX makefile options
for adding a prefix or suffix to all exported symbol names
in the shared library.(#459, Thanks Tony Kelman)
* Provide OpenBLASConfig.cmake at installation.
* Fix Fortran compiler detection on FreeBSD.
(#470, Thanks Mike Nolta)
x86/x86-64:
* Add generic kernel files for x86-64. make TARGET=GENERIC
* Fix a bug of sgemm kernel on Intel Sandy Bridge.
* Fix c_check bug on some amd64 systems. (#471, Thanks Mike Nolta)
ARM:
* Support APM's X-Gene 1 AArch64 processors.
Optimize trmm and sgemm. (#465, Thanks Dave Nuechterlein)
====================================================================
Version 0.2.12
13-Oct-2014

View File

@@ -9,10 +9,10 @@
If you want to allocate 64 large pages,
$shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset
$shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page
$shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number
$shell> echo 3355443200 > /pros/sys/kernel/shmall
$shell> echo 0 > /proc/sys/vm/nr_hugepages # need to be reset
$shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page
$shell> echo 3355443200 > /proc/sys/kernel/shmmax # just large number
$shell> echo 3355443200 > /proc/sys/kernel/shmall
Also may add a few lines into /etc/security/limits.conf file.

View File

@@ -1,4 +1,4 @@
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -12,9 +12,10 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

View File

@@ -20,6 +20,8 @@ ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install
@@ -131,7 +133,7 @@ ifeq ($(CORE), UNKOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif
ifeq ($(NOFORTRAN), 1)
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
$(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.)
endif
ifeq ($(NO_STATIC), 1)
ifeq ($(NO_SHARED), 1)
@@ -231,7 +233,7 @@ ifndef NOFORTRAN
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -247,10 +249,14 @@ ifndef NOFORTRAN
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
ifeq ($(FC), gfortran)
ifeq ($(F_COMPILER), GFORTRAN)
-@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc
ifdef SMP
ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
else
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
@@ -286,7 +292,17 @@ endif
lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
endif
lapack-runtest:
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)

View File

@@ -1,8 +1,23 @@
ifeq ($(CORE), ARMV7)
# ifeq logical or
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
@@ -11,8 +26,8 @@ endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
CCOMMON_OPT += -marm -march=armv5
FCOMMON_OPT += -marm -march=armv5
endif

View File

@@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a
endif
ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
endif

View File

@@ -9,6 +9,9 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
OPENBLAS_BINARY_DIR := $(PREFIX)/bin
OPENBLAS_BUILD_DIR := $(CURDIR)
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
.PHONY : install
.NOTPARALLEL : install
@@ -21,6 +24,7 @@ install : lib.grd
@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@@ -43,11 +47,11 @@ ifndef NO_CBLAS
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
endif
#for install static library
@@ -83,13 +87,43 @@ ifeq ($(OSNAME), Darwin)
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME).a $(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
endif
endif
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
endif
ifeq ($(OSNAME), Darwin)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
endif
else
#only static
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
endif
#Generating OpenBLASConfigVersion.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo Install OK!

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.12
VERSION = 0.2.16.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -80,7 +80,7 @@ VERSION = 0.2.12
# NO_LAPACKE = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you
@@ -159,6 +159,22 @@ COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
# Improve GEMV and GER for small matrices by stack allocation.
# For details, https://github.com/xianyi/OpenBLAS/pull/482
#
MAX_STACK_ALLOC=2048
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoid conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.
# For details, https://github.com/xianyi/OpenBLAS/pull/459
#
# The same prefix and suffix are also added to the library name,
# i.e. you get lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) rather than libopenblas
#
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
#
# End of user configuration
#

View File

@@ -23,6 +23,7 @@ CC = gcc
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
endif
endif
@@ -61,6 +62,12 @@ endif
ifeq ($(TARGET), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), STEAMROLLER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@@ -85,6 +92,12 @@ endif
ifeq ($(TARGET_CORE), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), STEAMROLLER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@@ -186,13 +199,21 @@ LD = $(CROSS_SUFFIX)ld
RANLIB = $(CROSS_SUFFIX)ranlib
NM = $(CROSS_SUFFIX)nm
DLLWRAP = $(CROSS_SUFFIX)dllwrap
OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv
# For detect fortran failed, only build BLAS.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
endif
#
# OS dependent settings
#
ifeq ($(OSNAME), Darwin)
export MACOSX_DEPLOYMENT_TARGET=10.2
export MACOSX_DEPLOYMENT_TARGET=10.6
MD5SUM = md5 -r
endif
@@ -303,6 +324,10 @@ ifdef SANITY_CHECK
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
endif
ifdef MAX_STACK_ALLOC
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
#
# Architecture dependent settings
#
@@ -311,6 +336,11 @@ ifeq ($(ARCH), x86)
ifndef BINARY
NO_BINARY_MODE = 1
endif
ifeq ($(CORE), generic)
NO_EXPRECISION = 1
endif
ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN)
# ifeq logical or. GCC or LSB
@@ -329,6 +359,11 @@ endif
endif
ifeq ($(ARCH), x86_64)
ifeq ($(CORE), generic)
NO_EXPRECISION = 1
endif
ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN)
# ifeq logical or. GCC or LSB
@@ -352,6 +387,12 @@ endif
ifeq ($(USE_OPENMP), 1)
#check
ifeq ($(USE_THREAD), 0)
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
endif
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
CCOMMON_OPT += -fopenmp
@@ -390,7 +431,7 @@ endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL
@@ -560,7 +601,7 @@ else
FCOMMON_OPT += -m32
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -fopenmp
endif
endif
@@ -572,14 +613,14 @@ ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
ifeq ($(F_COMPILER), FUJITSU)
CCOMMON_OPT += -DF_INTERFACE_FUJITSU
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
@@ -597,7 +638,7 @@ endif
else
FCOMMON_OPT += -q32
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
@@ -615,7 +656,7 @@ FCOMMON_OPT += -tp p7-64
else
FCOMMON_OPT += -tp p7
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
endif
@@ -644,7 +685,7 @@ FCOMMON_OPT += -mabi=n32
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
endif
@@ -681,7 +722,7 @@ FCOMMON_OPT += -m64
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FEXTRALIB += -lstdc++
FCOMMON_OPT += -mp
endif
@@ -729,14 +770,14 @@ FCOMMON_OPT += -m32
else
FCOMMON_OPT += -m64
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -xopenmp=parallel
endif
endif
ifeq ($(F_COMPILER), COMPAQ)
CCOMMON_OPT += -DF_INTERFACE_COMPAQ
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
@@ -839,10 +880,18 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
ifndef SYMBOLPREFIX
SYMBOLPREFIX =
endif
ifndef SYMBOLSUFFIX
SYMBOLSUFFIX =
endif
ifndef LIBNAMESUFFIX
LIBPREFIX = libopenblas
LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)
else
LIBPREFIX = libopenblas_$(LIBNAMESUFFIX)
LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
endif
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
@@ -922,16 +971,29 @@ ifeq ($(DEBUG), 1)
COMMON_OPT += -g
endif
ifeq ($(DEBUG), 1)
FCOMMON_OPT += -g
endif
ifndef COMMON_OPT
COMMON_OPT = -O2
endif
ifndef FCOMMON_OPT
ifeq ($(OSNAME), WINNT)
FCOMMON_OPT = -O0
else
FCOMMON_OPT = -O2 -frecursive
endif
endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
override FFLAGS += $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
#For LAPACK Fortran codes.

View File

@@ -1,7 +1,10 @@
# OpenBLAS
[![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
[![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
@@ -60,6 +63,7 @@ Please read GotoBLAS_01Readme.txt
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.

View File

@@ -32,6 +32,8 @@ ISTANBUL
BOBCAT
BULLDOZER
PILEDRIVER
STEAMROLLER
EXCAVATOR
c)VIA CPU:
SSE_GENERIC
@@ -42,6 +44,8 @@ NANO
POWER4
POWER5
POWER6
POWER7
POWER8
PPCG4
PPC970
PPC970MP
@@ -62,6 +66,13 @@ SPARC
SPARCV7
6.ARM CPU:
CORTEXA15
CORTEXA9
ARMV7
ARMV6
ARMV5
7.ARM 64-bit CPU:
ARMV8
CORTEXA57

42
appveyor.yml Normal file
View File

@@ -0,0 +1,42 @@
version: 0.2.15.{build}
#environment:
platform:
- x64
configuration: Release
clone_folder: c:\projects\OpenBLAS
init:
- git config --global core.autocrlf input
build:
project: OpenBLAS.sln
clone_depth: 5
#branches to build
branches:
only:
- master
- develop
- cmake
skip_tags: true
matrix:
fast_finish: true
skip_commits:
# Add [av skip] to commit messages
message: /\[av skip\]/
before_build:
- echo Running cmake...
- cd c:\projects\OpenBLAS
- cmake -G "Visual Studio 12 Win64" .
test_script:
- echo Build OK!

9
benchmark/Make_exe.sh Executable file
View File

@@ -0,0 +1,9 @@
#!/bin/bash
for f in *.goto *.acml *.mkl *.atlas
do
if [ -f "$f" ]; then
mv $f `echo $f|tr '.' '_'`.exe
fi
done

File diff suppressed because it is too large Load Diff

196
benchmark/asum.c Normal file
View File

@@ -0,0 +1,196 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef ASUM
#ifdef COMPLEX
#ifdef DOUBLE
#define ASUM BLASFUNC(dzasum)
#else
#define ASUM BLASFUNC(scasum)
#endif
#else
#ifdef DOUBLE
#define ASUM BLASFUNC(dasum)
#else
#define ASUM BLASFUNC(sasum)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
FLOAT result;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = ASUM (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
#else
fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
#endif
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
@@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -71,8 +71,14 @@ double fabs(double);
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
@@ -99,6 +105,7 @@ int gettimeofday(struct timeval *tv, void *tz){
#endif
static __inline double getmflops(int ratio, int m, double secs){
double mm = (double)m;
@@ -117,7 +124,7 @@ static __inline double getmflops(int ratio, int m, double secs){
}
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
#ifndef COMPLEX
char *trans[] = {"T", "N"};
@@ -273,4 +280,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

201
benchmark/copy.c Normal file
View File

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef COPY
#ifdef COMPLEX
#ifdef DOUBLE
#define COPY BLASFUNC(zcopy)
#else
#define COPY BLASFUNC(ccopy)
#endif
#else
#ifdef DOUBLE
#define COPY BLASFUNC(dcopy)
#else
#define COPY BLASFUNC(scopy)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
COPY (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT result;
@@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -139,11 +139,12 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
FLOAT wkopt[4];
char job='V';
char jobr='N';
char *p;
blasint m, i, j, info,lwork;
@@ -202,9 +203,9 @@ int MAIN__(int argc, char *argv[]){
lwork = -1;
m=to;
#ifndef COMPLEX
GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
#else
GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
#endif
lwork = (blasint)wkopt[0];
@@ -226,16 +227,16 @@ int MAIN__(int argc, char *argv[]){
lwork = -1;
#ifndef COMPLEX
GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
#else
GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
#endif
lwork = (blasint)wkopt[0];
#ifndef COMPLEX
GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info);
#else
GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info);
#endif
gettimeofday( &stop, (struct timezone *)0);
@@ -257,4 +258,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,14 +118,15 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
FLOAT beta [] = {0.0, 0.0};
char trans='N';
blasint m, i, j;
blasint m, n, i, j;
int loops = 1;
int has_param_n=0;
int l;
char *p;
@@ -162,51 +163,61 @@ int MAIN__(int argc, char *argv[]){
if ( p != NULL )
loops = atoi(p);
if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p);
has_param_n=1;
}
#ifdef linux
srandom(getpid());
#endif
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
fprintf(stderr, " SIZE Flops\n");
fprintf(stderr, " SIZE Flops Time\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
if ( has_param_n == 1 && n <= m )
n=n;
else
n=m;
fprintf(stderr, " %6dx%d : ", (int)m, (int)n);
gettimeofday( &start, (struct timezone *)0);
for (l=0; l<loops; l++)
{
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg /= loops;
timeg = time1/loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1);
}
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@@ -209,4 +209,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@@ -266,4 +266,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -35,12 +35,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#undef GER
#ifdef COMPLEX
#ifdef DOUBLE
#define GER BLASFUNC(zgeru)
#else
#define GER BLASFUNC(cgeru)
#endif
#else
#ifdef DOUBLE
#define GER BLASFUNC(dger)
#else
#define GER BLASFUNC(sger)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
@@ -108,7 +115,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@@ -214,5 +221,5 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

218
benchmark/gesv.c Normal file
View File

@@ -0,0 +1,218 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
double fabs(double);
#undef GESV
#undef GETRS
#ifndef COMPLEX
#ifdef XDOUBLE
#define GESV BLASFUNC(qgesv)
#elif defined(DOUBLE)
#define GESV BLASFUNC(dgesv)
#else
#define GESV BLASFUNC(sgesv)
#endif
#else
#ifdef XDOUBLE
#define GESV BLASFUNC(xgesv)
#elif defined(DOUBLE)
#define GESV BLASFUNC(zgesv)
#else
#define GESV BLASFUNC(cgesv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b;
blasint *ipiv;
blasint m, i, j, info;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops Time\n");
for(m = from; m <= to; m += step){
fprintf(stderr, " %dx%d : ", (int)m, (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
b[i + j * m * COMPSIZE] = 0.0;
}
}
for (j = 0; j < m; ++j) {
for (i = 0; i < m * COMPSIZE; ++i) {
b[i] += a[i + j * m * COMPSIZE];
}
}
gettimeofday( &start, (struct timezone *)0);
GESV (&m, &m, a, &m, ipiv, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
fprintf(stderr,
"%10.2f MFlops %10.6f s\n",
COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a,*work;
FLOAT wkopt[4];
@@ -231,4 +231,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -107,7 +107,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@@ -189,4 +189,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@@ -205,4 +205,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -106,7 +106,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@@ -188,4 +188,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *c;
FLOAT alpha[] = {1.0, 1.0};
@@ -186,4 +186,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
blasint *ipiv;
@@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -52,6 +52,11 @@ C)
awk '/MFlops/ { print $3,int($9) }'|tail --lines=+2
;;
B)
# Copy Benchmark
awk '/MBytes/ { print $1,int($3) }'|tail --lines=+2
;;
*)
awk '/MFlops/ { print $1,int($3) }'|tail --lines=+2

View File

@@ -88,6 +88,10 @@ double fabs(double);
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
@@ -114,7 +118,7 @@ int gettimeofday(struct timeval *tv, void *tz){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
#ifndef COMPLEX
char *trans[] = {"T", "N"};
@@ -278,5 +282,5 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

202
benchmark/scal.c Normal file
View File

@@ -0,0 +1,202 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SCAL
#ifdef COMPLEX
#ifdef DOUBLE
#define SCAL BLASFUNC(zscal)
#else
#define SCAL BLASFUNC(cscal)
#endif
#else
#ifdef DOUBLE
#define SCAL BLASFUNC(dscal)
#else
#define SCAL BLASFUNC(sscal)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SCAL (&m, alpha, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
#else
fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
#endif
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_cgemm(N,l):
A = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j;
B = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j;
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 8*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_cgemm(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_cgemv(N,l):
A = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j;
B = randn(N).astype('float32') + randn(N).astype('float32') * 1j;
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 8*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_cgemv(i,LOOPS)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
from scipy.linalg.blas import daxpy
def run_daxpy(N,l):
x = randn(N).astype('float64')
y = randn(N).astype('float64')
start = time.time();
for i in range(0,l):
y = daxpy(x,y, a=2.0 )
end = time.time()
timediff = (end -start)
mflops = ( 2*N ) *l / timediff
mflops *= 1e-6
size = "%d" % (N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_daxpy(i,LOOPS)

56
benchmark/scripts/NUMPY/ddot.py Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_ddot(N,l):
A = randn(N).astype('float64')
B = randn(N).astype('float64')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N ) *l / timediff
mflops *= 1e-6
size = "%d" % (N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_ddot(i,LOOPS)

55
benchmark/scripts/NUMPY/deig.py Executable file
View File

@@ -0,0 +1,55 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_deig(N,l):
A = randn(N,N).astype('float64')
start = time.time();
for i in range(0,l):
la,v = numpy.linalg.eig(A)
end = time.time()
timediff = (end -start)
mflops = ( 26.33 *N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_deig(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dgemm(N,l):
A = randn(N,N).astype('float64')
B = randn(N,N).astype('float64')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_dgemm(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dgemv(N,l):
A = randn(N,N).astype('float64')
B = randn(N).astype('float64')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_dgemv(i,LOOPS)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
from scipy.linalg.lapack import dgesv
def run_dgesv(N,l):
a = randn(N,N).astype('float64')
b = randn(N,N).astype('float64')
start = time.time();
for i in range(0,l):
dgesv(a,b,1,1)
end = time.time()
timediff = (end -start)
mflops = ( 2.0/3.0 *N*N*N + 2.0*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_dgesv(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dsolve(N,l):
A = randn(N,N).astype('float64')
B = randn(N,N).astype('float64')
start = time.time();
for i in range(0,l):
ref = numpy.linalg.solve(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2.0/3.0 *N*N*N + 2.0*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_dsolve(i,LOOPS)

56
benchmark/scripts/NUMPY/sdot.py Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sdot(N,l):
A = randn(N).astype('float32')
B = randn(N).astype('float32')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N ) *l / timediff
mflops *= 1e-6
size = "%d" % (N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_sdot(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sgemm(N,l):
A = randn(N,N).astype('float32')
B = randn(N,N).astype('float32')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_sgemm(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sgemv(N,l):
A = randn(N,N).astype('float32')
B = randn(N).astype('float32')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_sgemv(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_zgemm(N,l):
A = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j;
B = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j;
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 8*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_zgemm(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_zgemv(N,l):
A = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j;
B = randn(N).astype('float64') + randn(N).astype('float64') * 1j;
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 8*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_zgemv(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = single(rand(n,n)) + single(rand(n,n)) * 1i;
B = single(rand(n,n)) + single(rand(n,n)) * 1i;
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 4.0 * 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = single(rand(n,n)) + single(rand(n,n)) * 1i;
B = single(rand(n,1)) + single(rand(n,1)) * 1i;
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 4.0 * 2.0*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

56
benchmark/scripts/OCTAVE/deig.m Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n));
start = clock();
l=0;
while l < loops
[V,lambda] = eig(A);
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 26.33 *n*n*n ) *loops / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg );
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n));
B = double(rand(n,n));
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n));
B = double(rand(n,1));
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 2.0*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,59 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n));
B = double(rand(n,n));
start = clock();
l=0;
while l < loops
x = linsolve(A,B);
#x = A / B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
#r = norm(A*x - B)/norm(B)
mflops = ( 2.0/3.0 *n*n*n + 2.0*n*n*n ) *loops / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg );
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = single(rand(n,n));
B = single(rand(n,n));
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = single(rand(n,n));
B = single(rand(n,1));
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 2.0*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n)) + double(rand(n,n)) * 1i;
B = double(rand(n,n)) + double(rand(n,n)) * 1i;
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 4.0 * 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n)) + double(rand(n,n)) * 1i;
B = double(rand(n,1)) + double(rand(n,1)) * 1i;
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 4.0 * 2.0*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

62
benchmark/scripts/R/deig.R Executable file
View File

@@ -0,0 +1,62 @@
#!/usr/bin/Rscript
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
start <- proc.time()[3]
while ( l <= loops ) {
ev <- eigen(A)
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = (26.66 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
}

63
benchmark/scripts/R/dgemm.R Executable file
View File

@@ -0,0 +1,63 @@
#!/usr/bin/Rscript
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
start <- proc.time()[3]
while ( l <= loops ) {
C <- A %*% B
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
}

63
benchmark/scripts/R/dsolve.R Executable file
View File

@@ -0,0 +1,63 @@
#!/usr/bin/Rscript
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
start <- proc.time()[3]
while ( l <= loops ) {
solve(A,B)
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
}

201
benchmark/swap.c Normal file
View File

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above swapright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above swapright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE SWAPRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SWAP
#ifdef COMPLEX
#ifdef DOUBLE
#define SWAP BLASFUNC(zswap)
#else
#define SWAP BLASFUNC(cswap)
#endif
#else
#ifdef DOUBLE
#define SWAP BLASFUNC(dswap)
#else
#define SWAP BLASFUNC(sswap)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SWAP (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@@ -215,4 +215,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *c;
FLOAT alpha[] = {1.0, 1.0};
@@ -196,4 +196,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT alpha[] = {1.0, 1.0};
@@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT alpha[] = {1.0, 1.0};
@@ -130,11 +130,21 @@ int MAIN__(int argc, char *argv[]){
char trans='N';
char diag ='U';
int l;
int loops = 1;
double timeg;
if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
p = getenv("OPENBLAS_LOOPS");
if ( p != NULL )
loops = atoi(p);
blasint m, i, j;
int from = 1;
@@ -150,7 +160,7 @@ int MAIN__(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c\n", from, to, step,side,uplo,trans,diag);
fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c Loops = %d\n", from, to, step,side,uplo,trans,diag,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
@@ -171,32 +181,39 @@ int MAIN__(int argc, char *argv[]){
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
timeg=0.0;
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
fprintf(stderr, " %6d : ", (int)m);
gettimeofday( &start, (struct timezone *)0);
for (l=0; l<loops; l++)
{
TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
gettimeofday( &stop, (struct timezone *)0);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
gettimeofday( &start, (struct timezone *)0);
TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
time1 = timeg/loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

196
benchmark/zdot-intel.c Normal file
View File

@@ -0,0 +1,196 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#define RETURN_BY_STACK 1
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT _Complex result;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
DOT (&result, &m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

195
benchmark/zdot.c Normal file
View File

@@ -0,0 +1,195 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT _Complex result;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = DOT (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

11
c_check
View File

@@ -3,6 +3,10 @@
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
$hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$binary = $ENV{"BINARY"};
@@ -27,7 +31,7 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
$cross_suffix = $1;
}
} else {
if ($ARGV[0] =~ /(.*-)(.*)/) {
if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
$cross_suffix = $1;
}
}
@@ -54,6 +58,7 @@ $os = osf if ($data =~ /OS_OSF/);
$os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
@@ -80,6 +85,10 @@ if (($architecture eq "mips32") || ($architecture eq "mips64")) {
$defined = 1;
}
if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
if ($architecture eq "alpha") {
$defined = 1;
$binary = 64;

16
cblas.h
View File

@@ -13,6 +13,12 @@ extern "C" {
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/*Get the number of threads on runtime.*/
int openblas_get_num_threads(void);
/*Get the number of physical processors (cores).*/
int openblas_get_num_procs(void);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);
@@ -341,6 +347,16 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
float *c, OPENBLAS_CONST blasint cldc);
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
double *c, OPENBLAS_CONST blasint cldc);
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
float *c, OPENBLAS_CONST blasint cldc);
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
double *c, OPENBLAS_CONST blasint cldc);
#ifdef __cplusplus
}
#endif /* __cplusplus */

View File

@@ -1,334 +0,0 @@
#ifndef CBLAS_H
#define CBLAS_H
#include <stddef.h>
#include "common.h"
#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */
/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);
/* Get the parallelization type which is used by OpenBLAS */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
#define OPENBLAS_OPENMP 2
#define CBLAS_INDEX size_t
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
float cblas_sasum (blasint n, float *x, blasint incx);
double cblas_dasum (blasint n, double *x, blasint incx);
float cblas_scasum(blasint n, float *x, blasint incx);
double cblas_dzasum(blasint n, double *x, blasint incx);
float cblas_snrm2 (blasint N, float *X, blasint incX);
double cblas_dnrm2 (blasint N, double *X, blasint incX);
float cblas_scnrm2(blasint N, float *X, blasint incX);
double cblas_dznrm2(blasint N, double *X, blasint incX);
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy);
void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy);
void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy);
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s);
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy);
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy);
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy);
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy);
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
float *Y, blasint incY, float *A, blasint lda);
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
double *Y, blasint incY, double *A, blasint lda);
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
/*** BLAS extensions ***/
void cblas_saxpby(blasint n, float alpha, float *x, blasint incx,float beta, float *y, blasint incy);
void cblas_daxpby(blasint n, double alpha, double *x, blasint incx,double beta, double *y, blasint incy);
void cblas_caxpby(blasint n, float *alpha, float *x, blasint incx,float *beta, float *y, blasint incy);
void cblas_zaxpby(blasint n, double *alpha, double *x, blasint incx,double *beta, double *y, blasint incy);
void cblas_somatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a,
blasint clda, float *b, blasint cldb);
void cblas_domatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a,
blasint clda, double *b, blasint cldb);
void cblas_comatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a,
blasint clda, void *b, blasint cldb);
void cblas_zomatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a,
blasint clda, void *b, blasint cldb);
void cblas_simatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a,
blasint clda, blasint cldb);
void cblas_dimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a,
blasint clda, blasint cldb);
void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float* calpha, float* a,
blasint clda, blasint cldb);
void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a,
blasint clda, blasint cldb);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif

115
cmake/arch.cmake Normal file
View File

@@ -0,0 +1,115 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets various variables based on architecture.
if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
if (${ARCH} STREQUAL "x86")
if (NOT BINARY)
set(NO_BINARY_MODE 1)
endif ()
endif ()
if (NOT NO_EXPRECISION)
if (${F_COMPILER} MATCHES "GFORTRAN")
# N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
set(EXPRECISION 1)
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double")
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
set(EXPRECISION 1)
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION")
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
endif ()
endif ()
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981")
endif ()
if (USE_OPENMP)
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
message(WARNING "Clang doesn't support OpenMP yet.")
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -openmp")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
set(CEXTRALIB "${CEXTRALIB} -lstdc++")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
endif ()
endif ()
if (DYNAMIC_ARCH)
if (${ARCH} STREQUAL "x86")
set(DYNAMIC_CORE "KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO")
endif ()
if (${ARCH} STREQUAL "x86_64")
set(DYNAMIC_CORE "PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO")
if (NOT NO_AVX)
set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER")
endif ()
if (NOT NO_AVX2)
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL")
endif ()
endif ()
if (NOT DYNAMIC_CORE)
unset(DYNAMIC_ARCH)
endif ()
endif ()
if (${ARCH} STREQUAL "ia64")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
if (${F_COMPILER} MATCHES "GFORTRAN")
if (${CMAKE_C_COMPILER} STREQUAL "GNU")
# EXPRECISION = 1
# CCOMMON_OPT += -DEXPRECISION
endif ()
endif ()
endif ()
if (${ARCH} STREQUAL "mips64")
set(NO_BINARY_MODE 1)
endif ()
if (${ARCH} STREQUAL "alpha")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()
if (${ARCH} STREQUAL "arm")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()
if (${ARCH} STREQUAL "arm64")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()

89
cmake/c_check.cmake Normal file
View File

@@ -0,0 +1,89 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from the OpenBLAS/c_check perl script.
## This is triggered by prebuild.cmake and runs before any of the code is built.
## Creates config.h and Makefile.conf.
# CMake vars set by this file:
# OSNAME (use CMAKE_SYSTEM_NAME)
# ARCH
# C_COMPILER (use CMAKE_C_COMPILER)
# BINARY32
# BINARY64
# FU
# CROSS_SUFFIX
# CROSS
# CEXTRALIB
# Defines set by this file:
# OS_
# ARCH_
# C_
# __32BIT__
# __64BIT__
# FUNDERSCORE
# PTHREAD_CREATE_FUNC
# N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables.
set(FU "")
if(APPLE)
set(FU "_")
elseif(MSVC)
set(FU "_")
elseif(UNIX)
set(FU "")
endif()
# Convert CMake vars into the format that OpenBLAS expects
string(TOUPPER ${CMAKE_SYSTEM_NAME} HOST_OS)
if (${HOST_OS} STREQUAL "WINDOWS")
set(HOST_OS WINNT)
endif ()
# added by hpa - check size of void ptr to detect 64-bit compile
if (NOT DEFINED BINARY)
set(BINARY 32)
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
set(BINARY 64)
endif ()
endif ()
if (BINARY EQUAL 64)
set(BINARY64 1)
else ()
set(BINARY32 1)
endif ()
# CMake docs define these:
# CMAKE_SYSTEM_PROCESSOR - The name of the CPU CMake is building for.
# CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on.
#
# TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check
set(ARCH ${CMAKE_SYSTEM_PROCESSOR})
if (${ARCH} STREQUAL "AMD64")
set(ARCH "x86_64")
endif ()
# If you are using a 32-bit compiler on a 64-bit system CMAKE_SYSTEM_PROCESSOR will be wrong
if (${ARCH} STREQUAL "x86_64" AND BINARY EQUAL 32)
set(ARCH x86)
endif ()
if (${ARCH} STREQUAL "X86")
set(ARCH x86)
endif ()
set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
set(COMPILER_ID "GCC")
endif ()
string(TOUPPER ${ARCH} UC_ARCH)
file(WRITE ${TARGET_CONF}
"#define OS_${HOST_OS}\t1\n"
"#define ARCH_${UC_ARCH}\t1\n"
"#define C_${COMPILER_ID}\t1\n"
"#define __${BINARY}BIT__\t1\n"
"#define FUNDERSCORE\t${FU}\n")

103
cmake/cc.cmake Normal file
View File

@@ -0,0 +1,103 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets C related variables.
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
set(NO_UNINITIALIZED_WARN "-Wno-uninitialized")
if (QUIET_MAKE)
set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused")
endif ()
if (NO_BINARY_MODE)
if (${ARCH} STREQUAL "mips64")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=n32")
endif ()
set(BINARY_DEFINED 1)
endif ()
if (${CORE} STREQUAL "LOONGSON3A")
set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64")
set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64")
endif ()
if (${CORE} STREQUAL "LOONGSON3B")
set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64")
set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64")
endif ()
if (${OSNAME} STREQUAL "AIX")
set(BINARY_DEFINED 1)
endif ()
endif ()
if (NOT BINARY_DEFINED)
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
endif ()
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
if (${ARCH} STREQUAL "mips64")
if (NOT BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -n32")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -n64")
endif ()
if (${CORE} STREQUAL "LOONGSON3A")
set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static")
endif ()
if (${CORE} STREQUAL "LOONGSON3B")
set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static")
endif ()
else ()
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
endif ()
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
if (${ARCH} STREQUAL "x86")
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
endif ()

60
cmake/export.cmake Normal file
View File

@@ -0,0 +1,60 @@
#Only generate .def for dll on MSVC
if(MSVC)
set_source_files_properties(${OpenBLAS_DEF_FILE} PROPERTIES GENERATED 1)
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
else()
set(ARCH_IN ${ARCH})
endif()
if (${CORE} STREQUAL "generic")
set(ARCH_IN "GENERIC")
endif ()
if (NOT DEFINED EXPRECISION)
set(EXPRECISION_IN 0)
else()
set(EXPRECISION_IN ${EXPRECISION})
endif()
if (NOT DEFINED NO_CBLAS)
set(NO_CBLAS_IN 0)
else()
set(NO_CBLAS_IN ${NO_CBLAS})
endif()
if (NOT DEFINED NO_LAPACK)
set(NO_LAPACK_IN 0)
else()
set(NO_LAPACK_IN ${NO_LAPACK})
endif()
if (NOT DEFINED NO_LAPACKE)
set(NO_LAPACKE_IN 0)
else()
set(NO_LAPACKE_IN ${NO_LAPACKE})
endif()
if (NOT DEFINED NEED2UNDERSCORES)
set(NEED2UNDERSCORES_IN 0)
else()
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
endif()
if (NOT DEFINED ONLY_CBLAS)
set(ONLY_CBLAS_IN 0)
else()
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
endif()
add_custom_command(
TARGET ${OpenBLAS_LIBNAME} PRE_LINK
COMMAND perl
ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
COMMENT "Create openblas.def file"
VERBATIM)
endif()

66
cmake/f_check.cmake Normal file
View File

@@ -0,0 +1,66 @@
##
## Author: Hank Anderson <hank@statease.com>
## Copyright: (c) Stat-Ease, Inc.
## Created: 12/29/14
## Last Modified: 12/29/14
## Description: Ported from the OpenBLAS/f_check perl script.
## This is triggered by prebuild.cmake and runs before any of the code is built.
## Appends Fortran information to config.h and Makefile.conf.
# CMake vars set by this file:
# F_COMPILER
# FC
# BU
# NOFORTRAN
# NEED2UNDERSCORES
# FEXTRALIB
# Defines set by this file:
# BUNDERSCORE
# NEEDBUNDERSCORE
# NEED2UNDERSCORES
if (MSVC)
# had to do this for MSVC, else CMake automatically assumes I have ifort... -hpa
include(CMakeForceCompiler)
CMAKE_FORCE_Fortran_COMPILER(gfortran GNU)
endif ()
if (NOT NO_LAPACK)
enable_language(Fortran)
else()
include(CMakeForceCompiler)
CMAKE_FORCE_Fortran_COMPILER(gfortran GNU)
endif()
if (NOT ONLY_CBLAS)
# N.B. f_check is not cross-platform, so instead try to use CMake variables
# run f_check (appends to TARGET files)
# message(STATUS "Running f_check...")
# execute_process(COMMAND perl f_check ${TARGET_MAKE} ${TARGET_CONF} ${CMAKE_Fortran_COMPILER}
# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
# TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile
# TODO: set FEXTRALIB flags a la f_check?
set(BU "_")
file(APPEND ${TARGET_CONF}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n"
"#define NEED2UNDERSCORES 0\n")
else ()
#When we only build CBLAS, we set NOFORTRAN=2
set(NOFORTRAN 2)
set(NO_FBLAS 1)
#set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler
set(BU "_")
file(APPEND ${TARGET_CONF}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
endif()
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
string(TOUPPER ${F_COMPILER} F_COMPILER)

200
cmake/fc.cmake Normal file
View File

@@ -0,0 +1,200 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.
if (${F_COMPILER} STREQUAL "G77")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
if (NOT NO_BINARY_MODE)
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
endif ()
endif ()
endif ()
if (${F_COMPILER} STREQUAL "G95")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G95")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
if (NOT NO_BINARY_MODE)
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
endif ()
endif ()
endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran")
endif ()
if (NO_BINARY_MODE)
if (${ARCH} STREQUAL "mips64")
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
endif ()
endif ()
else ()
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
endif ()
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "INTEL")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL")
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "FUJITSU")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "IBM")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM")
# FCOMMON_OPT += -qarch=440
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -q64")
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -qintsize=8")
endif ()
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -q32")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "PGI")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
if (BINARY64)
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7-64")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "PATHSCALE")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PATHSCALE")
if (BINARY64)
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
endif ()
if (NOT ${ARCH} STREQUAL "mips64")
if (NOT BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
else ()
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
endif ()
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "OPEN64")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_OPEN64")
if (BINARY64)
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
endif ()
if (${ARCH} STREQUAL "mips64")
if (NOT BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -n32")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -n64")
endif ()
if (${CORE} STREQUAL "LOONGSON3A")
set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static")
endif ()
if (${CORE} STREQUAL "LOONGSON3B")
set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static")
endif ()
else ()
if (NOT BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
endif ()
if (USE_OPENMP)
set(FEXTRALIB "${FEXTRALIB} -lstdc++")
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "SUN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_SUN")
if (${ARCH} STREQUAL "x86")
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "COMPAQ")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
endif ()
endif ()
# from the root Makefile - this is for lapack-netlib to compile the correct secnd file.
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(TIMER "INT_ETIME")
else ()
set(TIMER "NONE")
endif ()

165
cmake/kernel.cmake Normal file
View File

@@ -0,0 +1,165 @@
# helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)
set(QAMAXKERNEL amax.S)
set(CAMAXKERNEL zamax.S)
set(ZAMAXKERNEL zamax.S)
set(XAMAXKERNEL zamax.S)
set(SAMINKERNEL amin.S)
set(DAMINKERNEL amin.S)
set(QAMINKERNEL amin.S)
set(CAMINKERNEL zamin.S)
set(ZAMINKERNEL zamin.S)
set(XAMINKERNEL zamin.S)
set(SMAXKERNEL max.S)
set(DMAXKERNEL max.S)
set(QMAXKERNEL max.S)
set(SMINKERNEL min.S)
set(DMINKERNEL min.S)
set(QMINKERNEL min.S)
set(ISAMAXKERNEL iamax.S)
set(IDAMAXKERNEL iamax.S)
set(IQAMAXKERNEL iamax.S)
set(ICAMAXKERNEL izamax.S)
set(IZAMAXKERNEL izamax.S)
set(IXAMAXKERNEL izamax.S)
set(ISAMINKERNEL iamin.S)
set(IDAMINKERNEL iamin.S)
set(IQAMINKERNEL iamin.S)
set(ICAMINKERNEL izamin.S)
set(IZAMINKERNEL izamin.S)
set(IXAMINKERNEL izamin.S)
set(ISMAXKERNEL iamax.S)
set(IDMAXKERNEL iamax.S)
set(IQMAXKERNEL iamax.S)
set(ISMINKERNEL iamin.S)
set(IDMINKERNEL iamin.S)
set(IQMINKERNEL iamin.S)
set(SASUMKERNEL asum.S)
set(DASUMKERNEL asum.S)
set(CASUMKERNEL zasum.S)
set(ZASUMKERNEL zasum.S)
set(QASUMKERNEL asum.S)
set(XASUMKERNEL zasum.S)
set(SAXPYKERNEL axpy.S)
set(DAXPYKERNEL axpy.S)
set(CAXPYKERNEL zaxpy.S)
set(ZAXPYKERNEL zaxpy.S)
set(QAXPYKERNEL axpy.S)
set(XAXPYKERNEL zaxpy.S)
set(SCOPYKERNEL copy.S)
set(DCOPYKERNEL copy.S)
set(CCOPYKERNEL zcopy.S)
set(ZCOPYKERNEL zcopy.S)
set(QCOPYKERNEL copy.S)
set(XCOPYKERNEL zcopy.S)
set(SDOTKERNEL dot.S)
set(DDOTKERNEL dot.S)
set(CDOTKERNEL zdot.S)
set(ZDOTKERNEL zdot.S)
set(QDOTKERNEL dot.S)
set(XDOTKERNEL zdot.S)
set(SNRM2KERNEL nrm2.S)
set(DNRM2KERNEL nrm2.S)
set(QNRM2KERNEL nrm2.S)
set(CNRM2KERNEL znrm2.S)
set(ZNRM2KERNEL znrm2.S)
set(XNRM2KERNEL znrm2.S)
set(SROTKERNEL rot.S)
set(DROTKERNEL rot.S)
set(QROTKERNEL rot.S)
set(CROTKERNEL zrot.S)
set(ZROTKERNEL zrot.S)
set(XROTKERNEL zrot.S)
set(SSCALKERNEL scal.S)
set(DSCALKERNEL scal.S)
set(CSCALKERNEL zscal.S)
set(ZSCALKERNEL zscal.S)
set(QSCALKERNEL scal.S)
set(XSCALKERNEL zscal.S)
set(SSWAPKERNEL swap.S)
set(DSWAPKERNEL swap.S)
set(CSWAPKERNEL zswap.S)
set(ZSWAPKERNEL zswap.S)
set(QSWAPKERNEL swap.S)
set(XSWAPKERNEL zswap.S)
set(SGEMVNKERNEL gemv_n.S)
set(SGEMVTKERNEL gemv_t.S)
set(DGEMVNKERNEL gemv_n.S)
set(DGEMVTKERNEL gemv_t.S)
set(CGEMVNKERNEL zgemv_n.S)
set(CGEMVTKERNEL zgemv_t.S)
set(ZGEMVNKERNEL zgemv_n.S)
set(ZGEMVTKERNEL zgemv_t.S)
set(QGEMVNKERNEL gemv_n.S)
set(QGEMVTKERNEL gemv_t.S)
set(XGEMVNKERNEL zgemv_n.S)
set(XGEMVTKERNEL zgemv_t.S)
set(SCABS_KERNEL ../generic/cabs.c)
set(DCABS_KERNEL ../generic/cabs.c)
set(QCABS_KERNEL ../generic/cabs.c)
set(LSAME_KERNEL ../generic/lsame.c)
set(SAXPBYKERNEL ../arm/axpby.c)
set(DAXPBYKERNEL ../arm/axpby.c)
set(CAXPBYKERNEL ../arm/zaxpby.c)
set(ZAXPBYKERNEL ../arm/zaxpby.c)
endmacro ()
macro(SetDefaultL2)
set(SGEMVNKERNEL gemv_n.S)
set(SGEMVTKERNEL gemv_t.S)
set(DGEMVNKERNEL gemv_n.S)
set(DGEMVTKERNEL gemv_t.S)
set(CGEMVNKERNEL zgemv_n.S)
set(CGEMVTKERNEL zgemv_t.S)
set(ZGEMVNKERNEL zgemv_n.S)
set(ZGEMVTKERNEL zgemv_t.S)
set(QGEMVNKERNEL gemv_n.S)
set(QGEMVTKERNEL gemv_t.S)
set(XGEMVNKERNEL zgemv_n.S)
set(XGEMVTKERNEL zgemv_t.S)
set(SGERKERNEL ../generic/ger.c)
set(DGERKERNEL ../generic/ger.c)
set(QGERKERNEL ../generic/ger.c)
set(CGERUKERNEL ../generic/zger.c)
set(CGERCKERNEL ../generic/zger.c)
set(ZGERUKERNEL ../generic/zger.c)
set(ZGERCKERNEL ../generic/zger.c)
set(XGERUKERNEL ../generic/zger.c)
set(XGERCKERNEL ../generic/zger.c)
set(SSYMV_U_KERNEL ../generic/symv_k.c)
set(SSYMV_L_KERNEL ../generic/symv_k.c)
set(DSYMV_U_KERNEL ../generic/symv_k.c)
set(DSYMV_L_KERNEL ../generic/symv_k.c)
set(QSYMV_U_KERNEL ../generic/symv_k.c)
set(QSYMV_L_KERNEL ../generic/symv_k.c)
set(CSYMV_U_KERNEL ../generic/zsymv_k.c)
set(CSYMV_L_KERNEL ../generic/zsymv_k.c)
set(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
set(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
set(XSYMV_U_KERNEL ../generic/zsymv_k.c)
set(XSYMV_L_KERNEL ../generic/zsymv_k.c)
set(CHEMV_U_KERNEL ../generic/zhemv_k.c)
set(CHEMV_L_KERNEL ../generic/zhemv_k.c)
set(CHEMV_V_KERNEL ../generic/zhemv_k.c)
set(CHEMV_M_KERNEL ../generic/zhemv_k.c)
set(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
set(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
set(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
set(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
set(XHEMV_U_KERNEL ../generic/zhemv_k.c)
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
endmacro ()
macro(SetDefaultL3)
set(SGEADD_KERNEL ../generic/geadd.c)
set(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c)
endmacro ()

347
cmake/lapack.cmake Normal file
View File

@@ -0,0 +1,347 @@
# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files.
set(ALLAUX
ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f
../INSTALL/ilaver.f ../INSTALL/slamch.f
)
set(SCLAUX
sbdsdc.f
sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f
slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f
slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f
slagts.f slamrg.f slanst.f
slapy2.f slapy3.f slarnv.f
slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f
slarrk.f slarrr.f slaneg.f
slartg.f slaruv.f slas2.f slascl.f
slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f
slasd7.f slasd8.f slasda.f slasdq.f slasdt.f
slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f
slasr.f slasrt.f slassq.f slasv2.f spttrf.f sstebz.f sstedc.f
ssteqr.f ssterf.f slaisnan.f sisnan.f
slartgp.f slartgs.f
../INSTALL/second_${TIMER}.f
)
set(DZLAUX
dbdsdc.f
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f
dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f
dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f
dlagts.f dlamrg.f dlanst.f
dlapy2.f dlapy3.f dlarnv.f
dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f
dlarrk.f dlarrr.f dlaneg.f
dlartg.f dlaruv.f dlas2.f dlascl.f
dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f
dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f
dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f
dsteqr.f dsterf.f dlaisnan.f disnan.f
dlartgp.f dlartgs.f
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f
)
set(SLASRC
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f
sgetc2.f sgetri.f
sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f
sggglm.f sgghrd.f sgglse.f sggqrf.f
sggrqf.f sggsvd.f sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
slansy.f slantb.f slantp.f slantr.f slanv2.f
slapll.f slapmt.f
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f
slarrv.f slartv.f
slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f
sopgtr.f sopmtr.f sorg2l.f sorg2r.f
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
sorgrq.f sorgtr.f sorm2l.f sorm2r.f
sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f
sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f
spbstf.f spbsv.f spbsvx.f
spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f
sposvx.f spstrf.f spstf2.f
sppcon.f sppequ.f
spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f
spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f
ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f
ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f
sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f
ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f
sstevx.f
ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f
ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f
ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f
ssyswapr.f ssytrs.f ssytrs2.f ssyconv.f
ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f
ssytri_rook.f ssycon_rook.f ssysv_rook.f
stbcon.f
stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f
stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f
stptrs.f
strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
strtrs.f stzrqf.f stzrzf.f sstemr.f
slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f
stfttr.f stpttf.f stpttr.f strttf.f strttp.f
sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f
sgeequb.f ssyequb.f spoequb.f sgbequb.f
sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f
sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f
sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f
)
set(DSLASRC spotrs.f)
set(CLASRC
cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f
cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f
cgesvx.f cgetc2.f cgetri.f
cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f
cgghrd.f cgglse.f cggqrf.f cggrqf.f
cggsvd.f cggsvp.f
cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f
chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f
checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f
chegv.f chegvd.f chegvx.f cherfs.f chesv.f chesvx.f chetd2.f
chetf2.f chetrd.f
chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f
chetrs.f chetrs2.f
chetf2_rook.f chetrf_rook.f chetri_rook.f chetrs_rook.f checon_rook.f chesv_rook.f
chgeqz.f chpcon.f chpev.f chpevd.f
chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f
chpsvx.f
chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f
clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f
claed0.f claed7.f claed8.f
claein.f claesy.f claev2.f clags2.f clagtm.f
clahef.f clahef_rook.f clahqr.f
clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
clanhb.f clanhe.f
clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
clarf.f clarfb.f clarfg.f clarft.f clarfgp.f
clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f
clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f
cposv.f cposvx.f cpstrf.f cpstf2.f
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f
crot.f cspcon.f csprfs.f cspsv.f
cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
cstegr.f cstein.f csteqr.f
csycon.f
csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f csytri2.f csytri2x.f
csyswapr.f csytrs.f csytrs2.f csyconv.f
csytf2_rook.f csytrf_rook.f csytrs_rook.f
csytri_rook.f csycon_rook.f csysv_rook.f
ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f
ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f
ctprfs.f ctptri.f
ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f
cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f
cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f
chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f
ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f
cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f
cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f
cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f
cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f
)
set(ZCLASRC cpotrs.f)
set(DLASRC
dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f
dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f
dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f
dgetc2.f dgetri.f
dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f
dggglm.f dgghrd.f dgglse.f dggqrf.f
dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
dlapll.f dlapmt.f
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f
dlargv.f dlarrv.f dlartv.f
dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f
dopgtr.f dopmtr.f dorg2l.f dorg2r.f
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
dorgrq.f dorgtr.f dorm2l.f dorm2r.f
dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f
dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f
dpbstf.f dpbsv.f dpbsvx.f
dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f
dposvx.f dpotrs.f dpstrf.f dpstf2.f
dppcon.f dppequ.f
dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f
dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f
dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f
dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f
dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f
dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f
dstevx.f
dsycon.f dsyev.f dsyevd.f dsyevr.f
dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f
dsysv.f dsysvx.f
dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytri2.f dsytri2x.f
dsyswapr.f dsytrs.f dsytrs2.f dsyconv.f
dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f
dsytri_rook.f dsycon_rook.f dsysv_rook.f
dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f
dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f
dtptrs.f
dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f
dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f
dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f
dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f
dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f
dgeequb.f dsyequb.f dpoequb.f dgbequb.f
dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f
dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f
dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f
)
set(ZLASRC
zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f
zgetri.f
zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f
zgghrd.f zgglse.f zggqrf.f zggrqf.f
zggsvd.f zggsvp.f
zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f
zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f
zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f
zhetf2.f zhetrd.f
zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f
zhetrs.f zhetrs2.f
zhetf2_rook.f zhetrf_rook.f zhetri_rook.f zhetrs_rook.f zhecon_rook.f zhesv_rook.f
zhgeqz.f zhpcon.f zhpev.f zhpevd.f
zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f
zhpsvx.f
zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f
zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f
zlaed0.f zlaed7.f zlaed8.f
zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f
zlahef.f zlahef_rook.f zlahqr.f
zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
zlangt.f zlanhb.f
zlanhe.f
zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
zlarcm.f zlarf.f zlarfb.f
zlarfg.f zlarft.f zlarfgp.f
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
zlassq.f zlasyf.f zlasyf_rook.f
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
zrot.f zspcon.f zsprfs.f zspsv.f
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
zstegr.f zstein.f zsteqr.f
zsycon.f
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f
zsyswapr.f zsytrs.f zsytrs2.f zsyconv.f
zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f
zsytri_rook.f zsycon_rook.f zsysv_rook.f
ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f
ztprfs.f ztptri.f
ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f
zunmtr.f zupgtr.f
zupmtr.f izmax1.f dzsum1.f zstemr.f
zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f
zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f
ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f
zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f
zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f
zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f
zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f
ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f
)
set(LA_REL_SRC ${ALLAUX})
if (BUILD_SINGLE)
list(APPEND LA_REL_SRC ${SLASRC} ${DSLASRC} ${SCLAUX})
endif ()
if (BUILD_DOUBLE)
list(APPEND LA_REL_SRC ${DLASRC} ${DSLASRC} ${DZLAUX})
endif ()
if (BUILD_COMPLEX)
list(APPEND LA_REL_SRC ${CLASRC} ${ZCLASRC} ${SCLAUX})
endif ()
if (BUILD_COMPLEX16)
list(APPEND LA_REL_SRC ${ZLASRC} ${ZCLASRC} ${DZLAUX})
endif ()
# add lapack-netlib folder to the sources
set(LA_SOURCES "")
foreach (LA_FILE ${LA_REL_SRC})
list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}")
endforeach ()
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")

2067
cmake/lapacke.cmake Normal file

File diff suppressed because it is too large Load Diff

104
cmake/os.cmake Normal file
View File

@@ -0,0 +1,104 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from portion of OpenBLAS/Makefile.system
## Detects the OS and sets appropriate variables.
if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
set(ENV{MACOSX_DEPLOYMENT_TARGET} "10.2") # TODO: should be exported as an env var
set(MD5SUM "md5 -r")
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD")
set(MD5SUM "md5 -r")
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "NetBSD")
set(MD5SUM "md5 -n")
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
set(EXTRALIB "${EXTRALIB} -lm")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
set(EXTRALIB "${EXTRALIB} -lm")
endif ()
# TODO: this is probably meant for mingw, not other windows compilers
if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(NEED_PIC 0)
set(NO_EXPRECISION 1)
set(EXTRALIB "${EXTRALIB} -defaultlib:advapi32")
# probably not going to use these
set(SUFFIX "obj")
set(PSUFFIX "pobj")
set(LIBSUFFIX "a")
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
# Test for supporting MS_ABI
# removed string parsing in favor of CMake's version comparison -hpa
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
# GCC Version >=4.7
# It is compatible with MSVC ABI.
set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI")
endif ()
endif ()
# Ensure the correct stack alignment on Win32
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
if (${ARCH} STREQUAL "x86")
if (NOT MSVC AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2")
endif ()
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
set(NEED_PIC 0)
set(NO_EXPRECISION 1)
set(INTERIX_TOOL_DIR STREQUAL "/opt/gcc.3.3/i586-pc-interix3/bin")
endif ()
if (CYGWIN)
set(NEED_PIC 0)
set(NO_EXPRECISION 1)
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
if (SMP)
set(EXTRALIB "${EXTRALIB} -lpthread")
endif ()
endif ()
if (QUAD_PRECISION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DQUAD_PRECISION")
set(NO_EXPRECISION 1)
endif ()
if (${ARCH} STREQUAL "x86")
set(NO_EXPRECISION 1)
endif ()
if (UTEST_CHECK)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK")
set(SANITY_CHECK 1)
endif ()
if (SANITY_CHECK)
# TODO: need some way to get $(*F) (target filename)
set(CCOMMON_OPT "${CCOMMON_OPT} -DSANITY_CHECK -DREFNAME=$(*F)f${BU}")
endif ()

113
cmake/prebuild.cmake Normal file
View File

@@ -0,0 +1,113 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from OpenBLAS/Makefile.prebuild
## This is triggered by system.cmake and runs before any of the code is built.
## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files).
## Next it runs f_check and appends some fortran information to the files.
## Finally it runs getarch and getarch_2nd for even more environment information.
# CMake vars set by this file:
# CORE
# LIBCORE
# NUM_CORES
# HAVE_MMX
# HAVE_SSE
# HAVE_SSE2
# HAVE_SSE3
# MAKE
# SGEMM_UNROLL_M
# SGEMM_UNROLL_N
# DGEMM_UNROLL_M
# DGEMM_UNROLL_M
# QGEMM_UNROLL_N
# QGEMM_UNROLL_N
# CGEMM_UNROLL_M
# CGEMM_UNROLL_M
# ZGEMM_UNROLL_N
# ZGEMM_UNROLL_N
# XGEMM_UNROLL_M
# XGEMM_UNROLL_N
# CGEMM3M_UNROLL_M
# CGEMM3M_UNROLL_N
# ZGEMM3M_UNROLL_M
# ZGEMM3M_UNROLL_M
# XGEMM3M_UNROLL_N
# XGEMM3M_UNROLL_N
# CPUIDEMU = ../../cpuid/table.o
if (DEFINED CPUIDEMU)
set(EXFLAGS "-DCPUIDEMU -DVENDOR=99")
endif ()
if (DEFINED TARGET_CORE)
# set the C flags for just this file
set(GETARCH2_FLAGS "-DBUILD_KERNEL")
set(TARGET_MAKE "Makefile_kernel.conf")
set(TARGET_CONF "config_kernel.h")
else()
set(TARGET_MAKE "Makefile.conf")
set(TARGET_CONF "config.h")
endif ()
include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake")
if (NOT NOFORTRAN)
include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake")
endif ()
# compile getarch
set(GETARCH_SRC
${CMAKE_SOURCE_DIR}/getarch.c
${CPUIDEMO}
)
if (NOT MSVC)
list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S)
endif ()
if (MSVC)
#Use generic for MSVC now
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
endif()
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 0 OUTPUT_VARIABLE GETARCH_MAKE_OUT)
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT)
message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}")
# append config data from getarch to the TARGET file and read in CMake vars
file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT})
ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT)
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT)
# append config data from getarch_2nd to the TARGET file and read in CMake vars
file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT})
ParseGetArchVars(${GETARCH2_MAKE_OUT})

552
cmake/system.cmake Normal file
View File

@@ -0,0 +1,552 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from OpenBLAS/Makefile.system
##
set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib")
# TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa
# http://stackoverflow.com/questions/714100/os-detecting-makefile
# TODO: Makefile.system sets HOSTCC = $(CC) here if not already set -hpa
# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
if (DEFINED TARGET_CORE)
set(TARGET ${TARGET_CORE})
endif ()
# Force fallbacks for 32bit
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
set(TARGET "NEHALEM")
endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER")
set(TARGET "BARCELONA")
endif ()
endif ()
if (DEFINED TARGET)
message(STATUS "Targetting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif ()
if (INTERFACE64)
message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
endif ()
if (NOT DEFINED GEMM_MULTITHREAD_THRESHOLD)
set(GEMM_MULTITHREAD_THRESHOLD 4)
endif ()
message(STATUS "GEMM multithread threshold set to ${GEMM_MULTITHREAD_THRESHOLD}.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DGEMM_MULTITHREAD_THRESHOLD=${GEMM_MULTITHREAD_THRESHOLD}")
if (NO_AVX)
message(STATUS "Disabling Advanced Vector Extensions (AVX).")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX")
endif ()
if (NO_AVX2)
message(STATUS "Disabling Advanced Vector Extensions 2 (AVX2).")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX2")
endif ()
if (CMAKE_BUILD_TYPE STREQUAL Debug)
set(GETARCH_FLAGS "${GETARCH_FLAGS} -g")
endif ()
# TODO: let CMake handle this? -hpa
#if (${QUIET_MAKE})
# set(MAKE "${MAKE} -s")
#endif()
if (NOT DEFINED NO_PARALLEL_MAKE)
set(NO_PARALLEL_MAKE 0)
endif ()
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_PARALLEL_MAKE=${NO_PARALLEL_MAKE}")
if (CMAKE_CXX_COMPILER STREQUAL loongcc)
set(GETARCH_FLAGS "${GETARCH_FLAGS} -static")
endif ()
#if don't use Fortran, it will only compile CBLAS.
if (ONLY_CBLAS)
set(NO_LAPACK 1)
else ()
set(ONLY_CBLAS 0)
endif ()
include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake")
if (NOT DEFINED NUM_THREADS)
set(NUM_THREADS ${NUM_CORES})
endif ()
if (${NUM_THREADS} EQUAL 1)
set(USE_THREAD 0)
endif ()
if (DEFINED USE_THREAD)
if (NOT ${USE_THREAD})
unset(SMP)
else ()
set(SMP 1)
endif ()
else ()
# N.B. this is NUM_THREAD in Makefile.system which is probably a bug -hpa
if (${NUM_THREADS} EQUAL 1)
unset(SMP)
else ()
set(SMP 1)
endif ()
endif ()
if (${SMP})
message(STATUS "SMP enabled.")
endif ()
if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1)
endif ()
# TODO: I think CMake should be handling all this stuff -hpa
unset(ARFLAGS)
set(CPP "${COMPILER} -E")
set(AR "${CROSS_SUFFIX}ar")
set(AS "${CROSS_SUFFIX}as")
set(LD "${CROSS_SUFFIX}ld")
set(RANLIB "${CROSS_SUFFIX}ranlib")
set(NM "${CROSS_SUFFIX}nm")
set(DLLWRAP "${CROSS_SUFFIX}dllwrap")
set(OBJCOPY "${CROSS_SUFFIX}objcopy")
set(OBJCONV "${CROSS_SUFFIX}objconv")
# OS dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/os.cmake")
# Architecture dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake")
# C Compiler dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake")
endif ()
if (BINARY64)
if (INTERFACE64)
# CCOMMON_OPT += -DUSE64BITINT
endif ()
endif ()
if (NEED_PIC)
if (${CMAKE_C_COMPILER} STREQUAL "IBM")
set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -fPIC")
endif ()
if (${F_COMPILER} STREQUAL "SUN")
set(FCOMMON_OPT "${FCOMMON_OPT} -pic")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC")
endif ()
endif ()
if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
endif ()
if (NO_LAPACK)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
#Disable LAPACK C interface
set(NO_LAPACKE 1)
endif ()
if (NO_LAPACKE)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACKE")
endif ()
if (NO_AVX)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX")
endif ()
if (${ARCH} STREQUAL "x86")
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX")
endif ()
if (NO_AVX2)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
endif ()
if (SMP)
set(CCOMMON_OPT "${CCOMMON_OPT} -DSMP_SERVER")
if (${ARCH} STREQUAL "mips64")
if (NOT ${CORE} STREQUAL "LOONGSON3B")
set(USE_SIMPLE_THREADED_LEVEL3 1)
endif ()
endif ()
if (USE_OPENMP)
# USE_SIMPLE_THREADED_LEVEL3 = 1
# NO_AFFINITY = 1
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP")
endif ()
if (BIGNUMA)
set(CCOMMON_OPT "${CCOMMON_OPT} -DBIGNUMA")
endif ()
endif ()
if (NO_WARMUP)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_WARMUP")
endif ()
if (CONSISTENT_FPCSR)
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
endif ()
# Only for development
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
# set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_SWITCHING")
# set(USE_PAPI 1)
if (USE_PAPI)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_PAPI")
set(EXTRALIB "${EXTRALIB} -lpapi -lperfctr")
endif ()
if (DYNAMIC_THREADS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_THREADS")
endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
if (USE_SIMPLE_THREADED_LEVEL3)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
endif ()
if (DEFINED LIBNAMESUFFIX)
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
else ()
set(LIBPREFIX "libopenblas")
endif ()
if (NOT DEFINED SYMBOLPREFIX)
set(SYMBOLPREFIX "")
endif ()
if (NOT DEFINED SYMBOLSUFFIX)
set(SYMBOLSUFFIX "")
endif ()
set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC")
endif ()
if (${CORE} STREQUAL "PPC440FP2")
set(STATIC_ALLOCATION 1)
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
set(NO_AFFINITY 1)
endif ()
if (NOT ${ARCH} STREQUAL "x86_64" AND NOT ${ARCH} STREQUAL "x86" AND NOT ${CORE} STREQUAL "LOONGSON3B")
set(NO_AFFINITY 1)
endif ()
if (NO_AFFINITY)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AFFINITY")
endif ()
if (FUNCTION_PROFILE)
set(CCOMMON_OPT "${CCOMMON_OPT} -DFUNCTION_PROFILE")
endif ()
if (HUGETLB_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLB")
endif ()
if (DEFINED HUGETLBFILE_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=${HUGETLBFILE_ALLOCATION})")
endif ()
if (STATIC_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_STATIC")
endif ()
if (DEVICEDRIVER_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\"")
endif ()
if (MIXED_MEMORY_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "SunOS")
set(TAR gtar)
set(PATCH gpatch)
set(GREP ggrep)
else ()
set(TAR tar)
set(PATCH patch)
set(GREP grep)
endif ()
if (NOT DEFINED MD5SUM)
set(MD5SUM md5sum)
endif ()
set(AWK awk)
set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
if (DEBUG)
set(COMMON_OPT "${COMMON_OPT} -g")
endif ()
if (NOT DEFINED COMMON_OPT)
set(COMMON_OPT "-O2")
endif ()
#For x86 32-bit
if (DEFINED BINARY AND BINARY EQUAL 32)
if (NOT MSVC)
set(COMMON_OPT "${COMMON_OPT} -m32")
endif()
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}")
if(NOT MSVC)
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}")
endif()
# TODO: not sure what PFLAGS is -hpa
set(PFLAGS "${PFLAGS} ${COMMON_OPT} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${COMMON_OPT} ${FCOMMON_OPT}")
# TODO: not sure what FPFLAGS is -hpa
set(FPFLAGS "${FPFLAGS} ${COMMON_OPT} ${FCOMMON_OPT} ${COMMON_PROF}")
#For LAPACK Fortran codes.
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}")
set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")
#Disable -fopenmp for LAPACK Fortran codes on Windows.
if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parralel")
foreach (FILTER_FLAG ${FILTER_FLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
endforeach ()
endif ()
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
# lapack-netlib is rife with uninitialized warnings -hpa
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} -Wno-maybe-uninitialized")
endif ()
set(LAPACK_CFLAGS "${CMAKE_C_CFLAGS} -DHAVE_LAPACK_CONFIG_H")
if (INTERFACE64)
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_ILP64")
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DOPENBLAS_OS_WINDOWS")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE")
endif ()
if (NOT DEFINED SUFFIX)
set(SUFFIX o)
endif ()
if (NOT DEFINED PSUFFIX)
set(PSUFFIX po)
endif ()
if (NOT DEFINED LIBSUFFIX)
set(LIBSUFFIX a)
endif ()
if (DYNAMIC_ARCH)
if (DEFINED SMP)
set(LIBNAME "${LIBPREFIX}p${REVISION}.${LIBSUFFIX}")
set(LIBNAME_P "${LIBPREFIX}p${REVISION}_p.${LIBSUFFIX}")
else ()
set(LIBNAME "${LIBPREFIX}${REVISION}.${LIBSUFFIX}")
set(LIBNAME_P "${LIBPREFIX}${REVISION}_p.${LIBSUFFIX}")
endif ()
else ()
if (DEFINED SMP)
set(LIBNAME "${LIBPREFIX}_${LIBCORE}p${REVISION}.${LIBSUFFIX}")
set(LIBNAME_P "${LIBPREFIX}_${LIBCORE}p${REVISION}_p.${LIBSUFFIX}")
else ()
set(LIBNAME "${LIBPREFIX}_${LIBCORE}${REVISION}.${LIBSUFFIX}")
set(LIBNAME_P "${LIBPREFIX}_${LIBCORE}${REVISION}_p.${LIBSUFFIX}")
endif ()
endif ()
set(LIBDLLNAME "${LIBPREFIX}.dll")
set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")
set(LIBDYNNAME "${LIBNAME}.${LIBSUFFIX}.dylib")
set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def")
set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp")
set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip")
set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}")
set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}")
set(LIB_COMPONENTS BLAS)
if (NOT NO_CBLAS)
set(LIB_COMPONENTS "${LIB_COMPONENTS} CBLAS")
endif ()
if (NOT NO_LAPACK)
set(LIB_COMPONENTS "${LIB_COMPONENTS} LAPACK")
if (NOT NO_LAPACKE)
set(LIB_COMPONENTS "${LIB_COMPONENTS} LAPACKE")
endif ()
endif ()
if (ONLY_CBLAS)
set(LIB_COMPONENTS CBLAS)
endif ()
# For GEMM3M
set(USE_GEMM3M 0)
if (DEFINED ARCH)
if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "ia64" OR ${ARCH} STREQUAL "MIPS")
set(USE_GEMM3M 1)
endif ()
if (${CORE} STREQUAL "generic")
set(USE_GEMM3M 0)
endif ()
endif ()
#export OSNAME
#export ARCH
#export CORE
#export LIBCORE
#export PGCPATH
#export CONFIG
#export CC
#export FC
#export BU
#export FU
#export NEED2UNDERSCORES
#export USE_THREAD
#export NUM_THREADS
#export NUM_CORES
#export SMP
#export MAKEFILE_RULE
#export NEED_PIC
#export BINARY
#export BINARY32
#export BINARY64
#export F_COMPILER
#export C_COMPILER
#export USE_OPENMP
#export CROSS
#export CROSS_SUFFIX
#export NOFORTRAN
#export NO_FBLAS
#export EXTRALIB
#export CEXTRALIB
#export FEXTRALIB
#export HAVE_SSE
#export HAVE_SSE2
#export HAVE_SSE3
#export HAVE_SSSE3
#export HAVE_SSE4_1
#export HAVE_SSE4_2
#export HAVE_SSE4A
#export HAVE_SSE5
#export HAVE_AVX
#export HAVE_VFP
#export HAVE_VFPV3
#export HAVE_VFPV4
#export HAVE_NEON
#export KERNELDIR
#export FUNCTION_PROFILE
#export TARGET_CORE
#
#export SGEMM_UNROLL_M
#export SGEMM_UNROLL_N
#export DGEMM_UNROLL_M
#export DGEMM_UNROLL_N
#export QGEMM_UNROLL_M
#export QGEMM_UNROLL_N
#export CGEMM_UNROLL_M
#export CGEMM_UNROLL_N
#export ZGEMM_UNROLL_M
#export ZGEMM_UNROLL_N
#export XGEMM_UNROLL_M
#export XGEMM_UNROLL_N
#export CGEMM3M_UNROLL_M
#export CGEMM3M_UNROLL_N
#export ZGEMM3M_UNROLL_M
#export ZGEMM3M_UNROLL_N
#export XGEMM3M_UNROLL_M
#export XGEMM3M_UNROLL_N
#if (USE_CUDA)
# export CUDADIR
# export CUCC
# export CUFLAGS
# export CULIB
#endif
#.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f
#
#.f.$(SUFFIX):
# $(FC) $(FFLAGS) -c $< -o $(@F)
#
#.f.$(PSUFFIX):
# $(FC) $(FPFLAGS) -pg -c $< -o $(@F)
# these are not cross-platform
#ifdef BINARY64
#PATHSCALEPATH = /opt/pathscale/lib/3.1
#PGIPATH = /opt/pgi/linux86-64/7.1-5/lib
#else
#PATHSCALEPATH = /opt/pathscale/lib/3.1/32
#PGIPATH = /opt/pgi/linux86/7.1-5/lib
#endif
#ACMLPATH = /opt/acml/4.3.0
#ifneq ($(OSNAME), Darwin)
#MKLPATH = /opt/intel/mkl/10.2.2.025/lib
#else
#MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib
#endif
#ATLASPATH = /opt/atlas/3.9.17/opteron
#FLAMEPATH = $(HOME)/flame/lib
#ifneq ($(OSNAME), SunOS)
#SUNPATH = /opt/sunstudio12.1
#else
#SUNPATH = /opt/SUNWspro
#endif

346
cmake/utils.cmake Normal file
View File

@@ -0,0 +1,346 @@
# Functions to help with the OpenBLAS build
# Reads string from getarch into CMake vars. Format of getarch vars is VARNAME=VALUE
function(ParseGetArchVars GETARCH_IN)
string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH_IN}")
foreach (GETARCH_LINE ${GETARCH_RESULT_LIST})
# split the line into var and value, then assign the value to a CMake var
string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}")
list(GET SPLIT_VAR 0 VAR_NAME)
list(GET SPLIT_VAR 1 VAR_VALUE)
set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE)
endforeach ()
endfunction ()
# Reads a Makefile into CMake vars.
macro(ParseMakefileVars MAKEFILE_IN)
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
file(STRINGS ${MAKEFILE_IN} makefile_contents)
foreach (makefile_line ${makefile_contents})
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
set(var_name ${CMAKE_MATCH_1})
set(var_value ${CMAKE_MATCH_2})
# check for Makefile variables in the string, e.g. $(TSUFFIX)
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
foreach (make_var ${make_var_matches})
# strip out Makefile $() markup
string(REGEX REPLACE "\\$\\(([0-9_a-zA-Z]+)\\)" "\\1" make_var ${make_var})
# now replace the instance of the Makefile variable with the value of the CMake variable (note the double quote)
string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value})
endforeach ()
set(${var_name} ${var_value})
else ()
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
endif ()
endif ()
endforeach ()
endmacro ()
# Returns all combinations of the input list, as a list with colon-separated combinations
# E.g. input of A B C returns A B C A:B A:C B:C
# N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")).
# #param absent_codes codes to use when an element is absent from a combination. For example, if you have TRANS;UNIT;UPPER you may want the code to be NNL when nothing is present.
# @returns LIST_OUT a list of combinations
# CODES_OUT a list of codes corresponding to each combination, with N meaning the item is not present, and the first letter of the list item meaning it is presen
function(AllCombinations list_in absent_codes_in)
list(LENGTH list_in list_count)
set(num_combos 1)
# subtract 1 since we will iterate from 0 to num_combos
math(EXPR num_combos "(${num_combos} << ${list_count}) - 1")
set(LIST_OUT "")
set(CODES_OUT "")
foreach (c RANGE 0 ${num_combos})
set(current_combo "")
set(current_code "")
# this is a little ridiculous just to iterate through a list w/ indices
math(EXPR last_list_index "${list_count} - 1")
foreach (list_index RANGE 0 ${last_list_index})
math(EXPR bit "1 << ${list_index}")
math(EXPR combo_has_bit "${c} & ${bit}")
list(GET list_in ${list_index} list_elem)
if (combo_has_bit)
if (current_combo)
set(current_combo "${current_combo}:${list_elem}")
else ()
set(current_combo ${list_elem})
endif ()
string(SUBSTRING ${list_elem} 0 1 code_char)
else ()
list(GET absent_codes_in ${list_index} code_char)
endif ()
set(current_code "${current_code}${code_char}")
endforeach ()
if (current_combo STREQUAL "")
list(APPEND LIST_OUT " ") # Empty set is a valid combination, but CMake isn't appending the empty string for some reason, use a space
else ()
list(APPEND LIST_OUT ${current_combo})
endif ()
list(APPEND CODES_OUT ${current_code})
endforeach ()
set(LIST_OUT ${LIST_OUT} PARENT_SCOPE)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
# @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
# e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax"
# @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU)
# @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters)
# @param no_float_type turns off the float type define for this build (e.g. SINGLE/DOUBLE/etc)
# @param complex_filename_scheme some routines have separate source files for complex and non-complex float types.
# 0 - compiles for all types
# 1 - compiles the sources for non-complex types only (SINGLE/DOUBLE)
# 2 - compiles for complex types only (COMPLEX/DOUBLE COMPLEX)
# 3 - compiles for all types, but changes source names for complex by prepending z (e.g. axpy.c becomes zaxpy.c)
# 4 - compiles for complex types only, but changes source names for complex by prepending z (e.g. hemv.c becomes zhemv.c)
# STRING - compiles only the given type (e.g. DOUBLE)
function(GenerateNamedObjects sources_in)
if (DEFINED ARGV1)
set(defines_in ${ARGV1})
endif ()
if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "")
set(name_in ${ARGV2})
# strip off extension for kernel files that pass in the object name.
get_filename_component(name_in ${name_in} NAME_WE)
endif ()
if (DEFINED ARGV3)
set(use_cblas ${ARGV3})
else ()
set(use_cblas false)
endif ()
if (DEFINED ARGV4)
set(replace_last_with ${ARGV4})
endif ()
if (DEFINED ARGV5)
set(append_with ${ARGV5})
endif ()
if (DEFINED ARGV6)
set(no_float_type ${ARGV6})
else ()
set(no_float_type false)
endif ()
if (no_float_type)
set(float_list "DUMMY") # still need to loop once
else ()
set(float_list "${FLOAT_TYPES}")
endif ()
set(real_only false)
set(complex_only false)
set(mangle_complex_sources false)
if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "")
if (${ARGV7} EQUAL 1)
set(real_only true)
elseif (${ARGV7} EQUAL 2)
set(complex_only true)
elseif (${ARGV7} EQUAL 3)
set(mangle_complex_sources true)
elseif (${ARGV7} EQUAL 4)
set(mangle_complex_sources true)
set(complex_only true)
elseif (NOT ${ARGV7} EQUAL 0)
set(float_list ${ARGV7})
endif ()
endif ()
if (complex_only)
list(REMOVE_ITEM float_list "SINGLE")
list(REMOVE_ITEM float_list "DOUBLE")
elseif (real_only)
list(REMOVE_ITEM float_list "COMPLEX")
list(REMOVE_ITEM float_list "ZCOMPLEX")
endif ()
set(float_char "")
set(OBJ_LIST_OUT "")
foreach (float_type ${float_list})
foreach (source_file ${sources_in})
if (NOT no_float_type)
string(SUBSTRING ${float_type} 0 1 float_char)
string(TOLOWER ${float_char} float_char)
endif ()
if (NOT name_in)
get_filename_component(source_name ${source_file} NAME_WE)
set(obj_name "${float_char}${source_name}")
else ()
# replace * with float_char
if (${name_in} MATCHES "\\*")
string(REPLACE "*" ${float_char} obj_name ${name_in})
else ()
set(obj_name "${float_char}${name_in}")
endif ()
endif ()
if (replace_last_with)
string(REGEX REPLACE ".$" ${replace_last_with} obj_name ${obj_name})
else ()
set(obj_name "${obj_name}${append_with}")
endif ()
# now add the object and set the defines
set(obj_defines ${defines_in})
if (use_cblas)
set(obj_name "cblas_${obj_name}")
list(APPEND obj_defines "CBLAS")
endif ()
list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"")
if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX")
list(APPEND obj_defines "DOUBLE")
endif ()
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
list(APPEND obj_defines "COMPLEX")
if (mangle_complex_sources)
# add a z to the filename
get_filename_component(source_name ${source_file} NAME)
get_filename_component(source_dir ${source_file} DIRECTORY)
string(REPLACE ${source_name} "z${source_name}" source_file ${source_file})
endif ()
endif ()
if (VERBOSE_GEN)
message(STATUS "${obj_name}:${source_file}")
message(STATUS "${obj_defines}")
endif ()
# create a copy of the source to avoid duplicate obj filename problem with ar.exe
get_filename_component(source_extension ${source_file} EXT)
set(new_source_file "${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${obj_name}${source_extension}")
if (IS_ABSOLUTE ${source_file})
set(old_source_file ${source_file})
else ()
set(old_source_file "${CMAKE_CURRENT_LIST_DIR}/${source_file}")
endif ()
string(REPLACE ";" "\n#define " define_source "${obj_defines}")
string(REPLACE "=" " " define_source "${define_source}")
file(WRITE ${new_source_file} "#define ${define_source}\n#include \"${old_source_file}\"")
list(APPEND SRC_LIST_OUT ${new_source_file})
endforeach ()
endforeach ()
list(APPEND OPENBLAS_SRC ${SRC_LIST_OUT})
set(OPENBLAS_SRC ${OPENBLAS_SRC} PARENT_SCOPE)
endfunction ()
# generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in
# @param sources_in the source files to build from
# @param defines_in the preprocessor definitions that will be combined to create the object files
# @param all_defines_in (optional) preprocessor definitions that will be applied to all objects
# @param replace_scheme If 1, replace the "k" in the filename with the define combo letters. E.g. symm_k.c with TRANS and UNIT defined will be symm_TU.
# If 0, it will simply append the code, e.g. symm_L.c with TRANS and UNIT will be symm_LTU.
# If 2, it will append the code with an underscore, e.g. symm.c with TRANS and UNIT will be symm_TU.
# If 3, it will insert the code *around* the last character with an underscore, e.g. symm_L.c with TRANS and UNIT will be symm_TLU (required by BLAS level2 objects).
# If 4, it will insert the code before the last underscore. E.g. trtri_U_parallel with TRANS will be trtri_UT_parallel
# @param alternate_name replaces the source name as the object name (define codes are still appended)
# @param no_float_type turns off the float type define for this build (e.g. SINGLE/DOUBLE/etc)
# @param complex_filename_scheme see GenerateNamedObjects
function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme)
set(alternate_name_in "")
if (DEFINED ARGV5)
set(alternate_name_in ${ARGV5})
endif ()
set(no_float_type false)
if (DEFINED ARGV6)
set(no_float_type ${ARGV6})
endif ()
set(complex_filename_scheme "")
if (DEFINED ARGV7)
set(complex_filename_scheme ${ARGV7})
endif ()
AllCombinations("${defines_in}" "${absent_codes_in}")
set(define_combos ${LIST_OUT})
set(define_codes ${CODES_OUT})
list(LENGTH define_combos num_combos)
math(EXPR num_combos "${num_combos} - 1")
foreach (c RANGE 0 ${num_combos})
list(GET define_combos ${c} define_combo)
list(GET define_codes ${c} define_code)
foreach (source_file ${sources_in})
set(alternate_name ${alternate_name_in})
# replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with
string(REPLACE ":" ";" define_combo ${define_combo})
# now add the object and set the defines
set(cur_defines ${define_combo})
if ("${cur_defines}" STREQUAL " ")
set(cur_defines ${all_defines_in})
else ()
list(APPEND cur_defines ${all_defines_in})
endif ()
set(replace_code "")
set(append_code "")
if (replace_scheme EQUAL 1)
set(replace_code ${define_code})
else ()
if (replace_scheme EQUAL 2)
set(append_code "_${define_code}")
elseif (replace_scheme EQUAL 3)
if ("${alternate_name}" STREQUAL "")
string(REGEX MATCH "[a-zA-Z]\\." last_letter ${source_file})
else ()
string(REGEX MATCH "[a-zA-Z]$" last_letter ${alternate_name})
endif ()
# first extract the last letter
string(SUBSTRING ${last_letter} 0 1 last_letter) # remove period from match
# break the code up into the first letter and the remaining (should only be 2 anyway)
string(SUBSTRING ${define_code} 0 1 define_code_first)
string(SUBSTRING ${define_code} 1 -1 define_code_second)
set(replace_code "${define_code_first}${last_letter}${define_code_second}")
elseif (replace_scheme EQUAL 4)
# insert code before the last underscore and pass that in as the alternate_name
if ("${alternate_name}" STREQUAL "")
get_filename_component(alternate_name ${source_file} NAME_WE)
endif ()
set(extra_underscore "")
# check if filename has two underscores, insert another if not (e.g. getrs_parallel needs to become getrs_U_parallel not getrsU_parallel)
string(REGEX MATCH "_[a-zA-Z]+_" underscores ${alternate_name})
string(LENGTH "${underscores}" underscores)
if (underscores EQUAL 0)
set(extra_underscore "_")
endif ()
string(REGEX REPLACE "(.+)(_[^_]+)$" "\\1${extra_underscore}${define_code}\\2" alternate_name ${alternate_name})
else()
set(append_code ${define_code}) # replace_scheme should be 0
endif ()
endif ()
GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" false "${replace_code}" "${append_code}" "${no_float_type}" "${complex_filename_scheme}")
endforeach ()
endforeach ()
set(OPENBLAS_SRC ${OPENBLAS_SRC} PARENT_SCOPE)
endfunction ()

122
common.h
View File

@@ -82,7 +82,11 @@ extern "C" {
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if !defined(_MSC_VER)
#include <unistd.h>
#endif
#include <time.h>
#ifdef OS_LINUX
#include <malloc.h>
@@ -93,6 +97,14 @@ extern "C" {
#include <sched.h>
#endif
#ifdef OS_ANDROID
#define NO_SYSV_IPC
//Android NDK only supports complex.h since Android 5.0
#if __ANDROID_API__ < 21
#define FORCE_OPENBLAS_COMPLEX_STRUCT
#endif
#endif
#ifdef OS_WINDOWS
#ifdef ATOM
#define GOTO_ATOM ATOM
@@ -106,8 +118,11 @@ extern "C" {
#endif
#else
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
#endif
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <math.h>
#ifdef SMP
@@ -287,13 +302,6 @@ typedef int blasint;
#define COMPSIZE 2
#endif
#if defined(C_PGI) || defined(C_SUN)
#define CREAL(X) (*((FLOAT *)&X + 0))
#define CIMAG(X) (*((FLOAT *)&X + 1))
#else
#define CREAL __real__
#define CIMAG __imag__
#endif
#define Address_H(x) (((x)+(1<<15))>>16)
#define Address_L(x) ((x)-((Address_H(x))<<16))
@@ -307,8 +315,12 @@ typedef int blasint;
#endif
#if defined(OS_WINDOWS)
#if defined(_MSC_VER) && !defined(__clang__)
#define YIELDING YieldProcessor()
#else
#define YIELDING SwitchToThread()
#endif
#endif
#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5)
#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
@@ -327,6 +339,14 @@ typedef int blasint;
#endif
#endif
/*
#ifdef STEAMROLLER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
*/
#ifndef YIELDING
#define YIELDING sched_yield()
#endif
@@ -396,7 +416,51 @@ typedef char env_var_t[MAX_PATH];
typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n))
#endif
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
#ifdef _POSIX_MONOTONIC_CLOCK
#if defined(__GLIBC_PREREQ) // cut the if condition if two lines, otherwise will fail at __GLIBC_PREREQ(2, 17)
#if __GLIBC_PREREQ(2, 17) // don't require -lrt
#define USE_MONOTONIC
#endif
#elif defined(OS_ANDROID)
#define USE_MONOTONIC
#endif
#endif
/* use similar scale as x86 rdtsc for timeouts to work correctly */
static inline unsigned long long rpcc(void){
#ifdef USE_MONOTONIC
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
#else
struct timeval tv;
gettimeofday(&tv,NULL);
return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
#endif
}
#define RPCC_DEFINED
#define RPCC64BIT
#endif // !RPCC_DEFINED
#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
static void __inline blas_lock(volatile BLASULONG *address){
do {
while (*address) {YIELDING;};
} while (!__sync_bool_compare_and_swap(address, 0, 1));
}
#define BLAS_LOCK_DEFINED
#endif
#ifndef RPCC_DEFINED
#error "rpcc() implementation is missing for your platform"
#endif
#ifndef BLAS_LOCK_DEFINED
#error "blas_lock() implementation is missing for your platform"
#endif
#endif // !ASSEMBLER
#ifdef OS_LINUX
#include "common_linux.h"
@@ -442,18 +506,52 @@ typedef char* env_var_t;
/* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus)))
#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT)))
#define OPENBLAS_COMPLEX_C99
#ifndef __cplusplus
#include <complex.h>
#endif
typedef float _Complex openblas_complex_float;
typedef double _Complex openblas_complex_double;
typedef xdouble _Complex openblas_complex_xdouble;
#define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I))
#else
#define OPENBLAS_COMPLEX_STRUCT
typedef struct { float real, imag; } openblas_complex_float;
typedef struct { double real, imag; } openblas_complex_double;
typedef struct { xdouble real, imag; } openblas_complex_xdouble;
#define openblas_make_complex_float(real, imag) {(real), (imag)}
#define openblas_make_complex_double(real, imag) {(real), (imag)}
#define openblas_make_complex_xdouble(real, imag) {(real), (imag)}
#endif
#ifdef XDOUBLE
#define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble
#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i)
#elif defined(DOUBLE)
#define OPENBLAS_COMPLEX_FLOAT openblas_complex_double
#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_double(r,i)
#else
#define OPENBLAS_COMPLEX_FLOAT openblas_complex_float
#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_float(r,i)
#endif
#if defined(C_PGI) || defined(C_SUN)
#define CREAL(X) (*((FLOAT *)&X + 0))
#define CIMAG(X) (*((FLOAT *)&X + 1))
#else
#ifdef OPENBLAS_COMPLEX_STRUCT
#define CREAL(Z) ((Z).real)
#define CIMAG(Z) ((Z).imag)
#else
#define CREAL __real__
#define CIMAG __imag__
#endif
#endif
#endif // ASSEMBLER
#ifndef IFLUSH
@@ -470,6 +568,10 @@ typedef char* env_var_t;
#endif
#endif
#if defined(C_MSVC)
#define inline __inline
#endif
#ifndef ASSEMBLER
#ifndef MIN
@@ -491,6 +593,8 @@ void blas_set_parameter(void);
int blas_get_cpu_number(void);
void *blas_memory_alloc (int);
void blas_memory_free (void *);
void *blas_memory_alloc_nolock (int); //use malloc without blas_lock
void blas_memory_free_nolock (void *);
int get_num_procs (void);

View File

@@ -76,6 +76,7 @@ static void __inline blas_lock(unsigned long *address){
"30:", address);
#endif
}
#define BLAS_LOCK_DEFINED
static __inline unsigned int rpcc(void){
@@ -89,6 +90,7 @@ static __inline unsigned int rpcc(void){
return r0;
}
#define RPCC_DEFINED
#define HALT ldq $0, 0($0)

View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011-2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -13,9 +13,10 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -27,59 +28,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#ifndef COMMON_ARM
#define COMMON_ARM
#if defined(ARMV5) || defined(ARMV6)
#define MB
#define WMB
#else
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
#endif
#define INLINE inline
#define RETURN_BY_COMPLEX
#ifndef ASSEMBLER
#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8)
static void __inline blas_lock(volatile BLASULONG *address){
int register ret;
@@ -88,37 +61,29 @@ static void __inline blas_lock(volatile BLASULONG *address){
while (*address) {YIELDING;};
__asm__ __volatile__(
"ldrex r2, [%1] \n\t"
"mov r2, #0 \n\t"
"strex r3, r2, [%1] \n\t"
"mov %0 , r3 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
: "memory", "r2" , "r3"
"ldrex r2, [%1] \n\t"
"strex %0, %2, [%1] \n\t"
"orr %0, r2 \n\t"
: "=&r"(ret)
: "r"(address), "r"(1)
: "memory", "r2"
);
} while (ret);
MB;
}
static inline unsigned long long rpcc(void){
unsigned long long ret=0;
double v;
struct timeval tv;
gettimeofday(&tv,NULL);
v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
ret = (unsigned long long) ( v * 1000.0d );
return ret;
}
#define BLAS_LOCK_DEFINED
#endif
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
#if defined(DOUBLE)
#if !defined(HAVE_VFP)
/* no FPU, soft float */
#define GET_IMAGE(res)
#elif defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
@@ -166,4 +131,8 @@ REALNAME:
#define MAP_ANONYMOUS MAP_ANON
#endif
#if !defined(ARMV5) && !defined(ARMV6) && !defined(ARMV7) && !defined(ARMV8)
#error "you must define ARMV5, ARMV6, ARMV7 or ARMV8"
#endif
#endif

View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011-2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -13,9 +13,10 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -27,52 +28,14 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#ifndef COMMON_ARM64
#define COMMON_ARM64
#define MB
#define WMB
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
#define INLINE inline
@@ -80,48 +43,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef ASSEMBLER
static void __inline blas_lock(volatile BLASULONG *address){
/*
int register ret;
BLASULONG ret;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"ldrex r2, [%1] \n\t"
"mov r2, #0 \n\t"
"strex r3, r2, [%1] \n\t"
"mov %0 , r3 \n\t"
"mov x4, #1 \n\t"
"1: \n\t"
"ldaxr x2, [%1] \n\t"
"cbnz x2, 1b \n\t"
"2: \n\t"
"stxr w3, x4, [%1] \n\t"
"cbnz w3, 1b \n\t"
"mov %0, #0 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
: "memory", "r2" , "r3"
: "memory", "x2" , "x3", "x4"
);
} while (ret);
*/
}
#define BLAS_LOCK_DEFINED
static inline unsigned long long rpcc(void){
unsigned long long ret=0;
double v;
struct timeval tv;
gettimeofday(&tv,NULL);
v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
ret = (unsigned long long) ( v * 1000.0d );
return ret;
}
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
#if defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
#endif
#define GET_IMAGE_CANCEL
@@ -138,9 +100,10 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.arm ;\
.text ;\
.align 4 ;\
.global REALNAME ;\
.func REALNAME ;\
.type REALNAME, %function ;\
REALNAME:
#define EPILOGUE
@@ -157,7 +120,11 @@ REALNAME:
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#if defined(CORTEXA57)
#define BUFFER_SIZE (20 << 20)
#else
#define BUFFER_SIZE (16 << 20)
#endif
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
@@ -167,3 +134,4 @@ REALNAME:
#endif
#endif

View File

@@ -220,6 +220,16 @@
#define COMATCOPY_K_CTC comatcopy_k_ctc
#define COMATCOPY_K_RTC comatcopy_k_rtc
#define CIMATCOPY_K_CN cimatcopy_k_cn
#define CIMATCOPY_K_RN cimatcopy_k_rn
#define CIMATCOPY_K_CT cimatcopy_k_ct
#define CIMATCOPY_K_RT cimatcopy_k_rt
#define CIMATCOPY_K_CNC cimatcopy_k_cnc
#define CIMATCOPY_K_RNC cimatcopy_k_rnc
#define CIMATCOPY_K_CTC cimatcopy_k_ctc
#define CIMATCOPY_K_RTC cimatcopy_k_rtc
#define CGEADD_K cgeadd_k
#else
@@ -403,6 +413,17 @@
#define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc
#define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc
#define CIMATCOPY_K_CN gotoblas -> cimatcopy_k_cn
#define CIMATCOPY_K_RN gotoblas -> cimatcopy_k_rn
#define CIMATCOPY_K_CT gotoblas -> cimatcopy_k_ct
#define CIMATCOPY_K_RT gotoblas -> cimatcopy_k_rt
#define CIMATCOPY_K_CNC gotoblas -> cimatcopy_k_cnc
#define CIMATCOPY_K_RNC gotoblas -> cimatcopy_k_rnc
#define CIMATCOPY_K_CTC gotoblas -> cimatcopy_k_ctc
#define CIMATCOPY_K_RTC gotoblas -> cimatcopy_k_rtc
#define CGEADD_K gotoblas -> cgeadd_k
#endif
#define CGEMM_NN cgemm_nn

View File

@@ -150,6 +150,12 @@
#define DOMATCOPY_K_CT domatcopy_k_ct
#define DOMATCOPY_K_RT domatcopy_k_rt
#define DIMATCOPY_K_CN dimatcopy_k_cn
#define DIMATCOPY_K_RN dimatcopy_k_rn
#define DIMATCOPY_K_CT dimatcopy_k_ct
#define DIMATCOPY_K_RT dimatcopy_k_rt
#define DGEADD_K dgeadd_k
#else
#define DAMAX_K gotoblas -> damax_k
@@ -266,6 +272,12 @@
#define DOMATCOPY_K_RN gotoblas -> domatcopy_k_rn
#define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct
#define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt
#define DIMATCOPY_K_CN gotoblas -> dimatcopy_k_cn
#define DIMATCOPY_K_RN gotoblas -> dimatcopy_k_rn
#define DIMATCOPY_K_CT gotoblas -> dimatcopy_k_ct
#define DIMATCOPY_K_RT gotoblas -> dimatcopy_k_rt
#define DGEADD_K gotoblas -> dgeadd_k
#endif

View File

@@ -68,6 +68,7 @@ static __inline void blas_lock(volatile unsigned long *address){
: "ar.ccv", "memory");
} while (ret);
}
#define BLAS_LOCK_DEFINED
static __inline unsigned long rpcc(void) {
unsigned long clocks;
@@ -75,6 +76,7 @@ static __inline unsigned long rpcc(void) {
__asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks));
return clocks;
}
#define RPCC_DEFINED
static __inline unsigned long stmxcsr(void){
@@ -99,10 +101,12 @@ static __inline void blas_lock(volatile unsigned long *address){
while (*address || _InterlockedCompareExchange((volatile int *) address,1,0))
;
}
#define BLAS_LOCK_DEFINED
static __inline unsigned int rpcc(void) {
return __getReg(_IA64_REG_AR_ITC);
}
#define RPCC_DEFINED
static __inline unsigned int stmxcsr(void) {
return __getReg(_IA64_REG_AR_FPSR);

View File

@@ -754,6 +754,12 @@ void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, do
void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);
void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
#ifdef __cplusplus
}

View File

@@ -47,12 +47,12 @@ double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG);
double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG);
xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
float _Complex cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float _Complex cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG);
double _Complex zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double _Complex zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG);
xdouble _Complex xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble _Complex xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_float cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_double zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_xdouble xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);

View File

@@ -1736,31 +1736,60 @@ int somatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLAS
int somatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG);
int somatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG);
int somatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG);
int simatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG);
int simatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG);
int simatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG);
int simatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG);
int domatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG);
int domatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG);
int domatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG);
int domatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG);
int dimatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG);
int dimatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG);
int dimatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG);
int dimatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG);
int comatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
int comatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
int comatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
int comatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
int cimatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG);
int cimatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG);
int cimatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG);
int cimatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG);
int comatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
int comatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
int comatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
int comatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
int cimatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG);
int cimatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG);
int cimatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG);
int cimatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG);
int zomatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zomatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zomatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zomatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zimatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
int zimatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
int zimatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
int zimatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
int zomatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zimatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
int zimatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
int zimatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
int zimatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG);
int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG);
int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG);
int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG);
#ifdef __CUDACC__

Some files were not shown because too many files have changed in this diff Show More