Compare commits

..

1156 Commits

Author SHA1 Message Date
Zhang Xianyi
12ab1804b6 Merge branch 'develop' 2016-04-12 15:29:19 -04:00
Zhang Xianyi
1e03a62b67 Update doc for 0.2.18 version. 2016-04-12 15:28:31 -04:00
Zhang Xianyi
faa73690e4 Delete LOCAL_BUFFER_SIZE for other architectures. 2016-04-12 11:49:28 -04:00
Zhang Xianyi
f24d5307cf Refs #834. Fix zgemv config bug on Steamroller. 2016-04-12 22:26:11 +08:00
wernsaar
0a4276bc2f Merge pull request #837 from wernsaar/develop
updated zgemm- and ztrmm-kernel for POWER8
2016-04-08 11:13:27 +02:00
Werner Saar
08bddde3f3 updated benchmark Makefile for ESSL 2016-04-08 10:37:59 +02:00
Werner Saar
e173c51c04 updated zgemm- and ztrmm-kernel for POWER8 2016-04-08 09:05:37 +02:00
Werner Saar
9c42f0374a Updated cgemm- and sgemm-kernel for POWER8 SMP 2016-04-07 15:08:15 +02:00
Zhang Xianyi
d4380c1fe4 Refs xianyi/OpenBLAS-CI#10 , Fix sdot for scipy test_iterative.test_convergence test failure on AMD bulldozer and piledriver. 2016-04-07 01:44:18 +08:00
Werner Saar
a51102e9b7 bugfixes for sgemm- and cgemm-kernel 2016-04-06 11:15:21 +02:00
wernsaar
7282419525 Merge pull request #833 from wernsaar/develop
updated optimized cgemm- and ctrmm-kernel for POWER8
2016-04-04 12:29:51 +02:00
Werner Saar
c5b1fbcb2e updated optimized cgemm- and ctrmm-kernel for POWER8 2016-04-04 09:12:08 +02:00
wernsaar
e1cdd15b30 Merge pull request #832 from wernsaar/develop
updated cgemm- and ctrmm-kernel for POWER8
2016-04-03 15:05:25 +02:00
Werner Saar
d4c0330967 updated cgemm- and ctrmm-kernel for POWER8 2016-04-03 14:30:49 +02:00
Werner Saar
12540cedb5 added ESSL to Makefile for benchmarks 2016-04-03 07:21:48 +02:00
wernsaar
99adc8b062 Merge pull request #831 from wernsaar/develop
updated sgemm- and strmm-kernel for POWER8
2016-04-02 18:05:44 +02:00
Werner Saar
6a9bbfc227 updated sgemm- and strmm-kernel for POWER8 2016-04-02 17:16:36 +02:00
Zhang Xianyi
3349e9debd Merge pull request #830 from eschnett/patch-1
Correct small typo in comment
2016-04-01 17:35:22 -04:00
Erik Schnetter
dd7612358d Correct small typo in comment 2016-04-01 13:49:33 -04:00
Zhang Xianyi
e5a6ef3808 Merge pull request #829 from jeromerobert/bug828
Allow to force to do not use -j as make argument
2016-03-31 21:59:40 -04:00
Jerome Robert
7aac0aff8e Allow to force to do not use -j as make argument
Close #828 (hopefully)
2016-03-31 23:03:52 +02:00
wernsaar
26d7f06206 Merge pull request #827 from wernsaar/develop
added optimized dgemv_n kernel for POWER8
2016-03-30 12:04:49 +02:00
Werner Saar
68a69c5b50 added optimized dgemv_n kernel for POWER8 2016-03-30 11:10:53 +02:00
wernsaar
a571359afd Merge pull request #826 from wernsaar/develop
added optimized asum kernels for POWER8
2016-03-28 15:09:52 +02:00
Werner Saar
c2464a7c4a added optimized casum kernel for POWER8 2016-03-28 14:12:08 +02:00
Werner Saar
294f933869 added optimized zasum kernel for POWER8 2016-03-28 13:37:32 +02:00
Werner Saar
f59c9bd6ef added optimized sasum kernel for POWER8 2016-03-28 12:44:25 +02:00
Werner Saar
c53be46d78 added optimized dasum kernel for POWER8 2016-03-28 12:17:15 +02:00
wernsaar
bbb2d73d73 Merge pull request #825 from wernsaar/develop
added optimized cswap and zswap kernel for POWER8
2016-03-27 19:04:06 +02:00
Werner Saar
659ed16591 added otimized cswap and zswap kernels for POWER8 2016-03-27 18:31:37 +02:00
Werner Saar
35c98a3556 added optimized zscal kernel for POWER8 2016-03-27 16:31:50 +02:00
Werner Saar
f1a5dd06c5 added optimized sscal kernel for POWER8 2016-03-27 11:05:56 +02:00
wernsaar
e125a3dc33 Merge pull request #824 from wernsaar/develop
added optimized drot-kernel and srot-kernel for POWER8
2016-03-27 10:43:17 +02:00
Werner Saar
35f1f21a7f added drot- and srot-kernel optimimized for POWER8 2016-03-27 08:57:11 +02:00
Zhang Xianyi
7b4b7179ba Merge pull request #819 from ashwinyes/develop_20160324_fixes_optimizations
Cortex-A57: Fixes and Optimizations
2016-03-27 00:04:20 -04:00
Werner Saar
7a92c1538e added benchmark test for srot and drot 2016-03-26 07:14:13 +01:00
wernsaar
5727268141 Merge pull request #823 from wernsaar/develop
added optimized copy and swap kernels for POWER8
2016-03-25 18:08:48 +01:00
Werner Saar
3d9a50e841 added optimized sswap kernel for POWER8 2016-03-25 17:34:55 +01:00
Werner Saar
828c849b44 added optimized ccopy kernel for POWER8 2016-03-25 16:54:25 +01:00
Werner Saar
ecc0bc9813 added optimized scopy kernel for POWER8 2016-03-25 16:06:56 +01:00
Werner Saar
12f209b7b0 added optimized zswap kernel for POWER8 2016-03-25 15:27:34 +01:00
Werner Saar
7316a87930 added optimized dswap kernel for POWER8 2016-03-25 14:35:43 +01:00
Werner Saar
0bff057a87 added optimized dcopy kernel for POWER8 2016-03-25 13:03:02 +01:00
wernsaar
7ee1d29dd4 Merge pull request #822 from wernsaar/develop
added optimized dscal kernel for POWER8
2016-03-25 10:15:51 +01:00
Werner Saar
1e6cf9808c added optimized dscal kernel for POWER8 2016-03-25 09:42:08 +01:00
Ashwin Sekhar T K
278511ad2d Cortex-A57: Fix clang compilation errors 2016-03-24 10:42:04 +05:30
Ashwin Sekhar T K
3b5ffb49d3 Cortex-A57: Improve DGEMM 8x4 Implementation 2016-03-24 10:25:18 +05:30
wernsaar
8519e4ed9f Merge pull request #817 from wernsaar/develop
added optimized zaxpy kernel for POWER8
2016-03-23 13:37:04 +01:00
Werner Saar
55eda3813b added optimized zaxpy kernel for POWER8 2016-03-23 11:20:23 +01:00
Zhang Xianyi
53bfc83c26 Update appveyor version. 2016-03-22 11:37:35 -04:00
Zhang Xianyi
13ca89f6f0 Merge pull request #813 from theoractice/develop
Fix access violation on Windows while static linking in MSVC
2016-03-22 11:31:37 -04:00
wernsaar
461cf9ea38 Merge pull request #814 from wernsaar/develop
added optimized daxpy kernel for POWER8
2016-03-22 15:24:59 +01:00
Werner Saar
0664ba4c97 added optimized daxpy kernel for POWER8 2016-03-22 14:50:03 +01:00
Theoractice
aa744dfa59 Update memory.c 2016-03-22 20:02:37 +08:00
theoractice
61cf8f74d9 Fix access violation on Windows while static linking 2016-03-22 19:14:54 +08:00
Theoractice
de202fa375 Merge pull request #1 from xianyi/develop
upd
2016-03-22 05:33:20 -05:00
wernsaar
6f93b53590 Merge pull request #812 from wernsaar/develop
added optimized sdot kernel for POWER8
2016-03-21 13:59:44 +01:00
Werner Saar
11c44dede1 added optimized sdot kernel for POWER8 2016-03-21 13:18:23 +01:00
wernsaar
f00d642592 Merge pull request #811 from wernsaar/develop
added optimized zdot kernel for POWER8
2016-03-21 10:48:41 +01:00
Werner Saar
9e4584d069 added optimized zdot kernel for POWER8 2016-03-21 10:12:07 +01:00
Zhang Xianyi
2a5679da5f Merge branch 'release-0.2.17' into develop 2016-03-20 20:52:43 -04:00
Zhang Xianyi
a71e8c82f6 Fix change log typo. 2016-03-20 20:52:15 -04:00
Zhang Xianyi
9b987badb0 Merge branch 'master' into develop
Bump to 0.2.18.dev

Conflicts:
	CMakeLists.txt
	Makefile.rule
2016-03-20 20:48:21 -04:00
Zhang Xianyi
1619b2f3c8 Merge branch 'release-0.2.17' 2016-03-20 20:44:01 -04:00
Zhang Xianyi
4f3153395a Update doc for 0.2.17. 2016-03-20 20:43:42 -04:00
Zhang Xianyi
d7a1a7ff2a Merge branch 'release-0.2.17' into develop 2016-03-20 09:24:28 -04:00
Zhang Xianyi
308e6195b7 Refs #807. Enable BUILD_LAPACK_DEPRECATED=1 by default. 2016-03-20 09:22:56 -04:00
Zhang Xianyi
7a3d7b1f52 Merge pull request #808 from theoractice/develop
Fix a minor compiler error in VisualStudio with CMake
2016-03-20 09:07:47 -04:00
wernsaar
74cc2d6623 Merge pull request #809 from wernsaar/develop
Ref #795: added optimized ddot kernel for POWER8
2016-03-20 13:16:41 +01:00
theoractice
fc3a558515 Fix a minor compiler error in VisualStudio with CMake 2016-03-20 18:58:18 +08:00
Werner Saar
cd9fafc054 ddot for POWER8: updated licence information 2016-03-20 11:19:27 +01:00
Werner Saar
84b92e6373 added optimized ddot kernel for POWER8 2016-03-20 11:06:06 +01:00
wernsaar
c279a53ed8 Merge pull request #806 from wernsaar/develop
adding optimized single precision blas level3 kernels for POWER8
2016-03-18 12:46:16 +01:00
Werner Saar
e1df5a6e23 fixed sgemm- and strmm-kernel 2016-03-18 12:12:03 +01:00
Werner Saar
5c658f8746 add optimized cgemm- and ctrmm-kernel for POWER8 2016-03-18 08:17:25 +01:00
Zhang Xianyi
ec4390a967 Bump devlop version to 0.2.17.dev. 2016-03-15 14:52:01 -04:00
Zhang Xianyi
fced5744fb Merge branch 'release-0.2.16' 2016-03-15 14:49:10 -04:00
Zhang Xianyi
8c0fb1258d Update 0.2.16 doc 2016-03-15 14:48:41 -04:00
Zhang Xianyi
aae581d004 Merge branch 'develop' into release-0.2.16 2016-03-15 13:56:01 -04:00
Zhang Xianyi
e17303933a Merge pull request #802 from ashwinyes/develop_20160314_dgemm_optimization
DGEMM Optimizations for Cortex-A57
2016-03-14 20:31:03 -04:00
Zhang Xianyi
f9226275f4 Merge pull request #801 from Keno/patch-3
Don't pass REALNAME to `.end`
2016-03-14 15:42:31 -04:00
Ashwin Sekhar T K
cf8c7e28b3 Update CONTRIBUTORS.md 2016-03-14 20:01:02 +05:30
Ashwin Sekhar T K
5ac02f6dc7 Optimize Dgemm 4x4 for Cortex A57 2016-03-14 19:35:23 +05:30
Ashwin Sekhar T K
7aa1ad4923 Functional Assembly Kernels for CortexA57
Adding functional (non-optimized) kernels for Cortex-A57
with the following layouts.
SGEMM - 16x4, 8x8
CGEMM - 8x4
DGEMM - 8x4, 4x8
2016-03-14 19:33:21 +05:30
Werner Saar
dcd15b546c BUGFIX: KERNEL.POWER8 2016-03-14 14:36:59 +01:00
Werner Saar
96284ab295 added sgemm- and strmm-kernel for POWER8 2016-03-14 13:52:44 +01:00
Keno Fischer
d5e1255ca7 Don't pass REALNAME to .end
Putting the procedure there is an MSVC-ism, where it is optional. GCC silently ignores and Clang errors, so it is best to remove this.
2016-03-13 18:56:21 -04:00
Zhang Xianyi
587455868e Merge pull request #800 from jeromerobert/smallscaling
Fix smallscaling compilation
2016-03-10 15:45:33 -05:00
Jerome Robert
323c237e7b Fix smallscaling compilation
Also revert 0bbca5e
2016-03-10 20:24:41 +01:00
Werner Saar
faa5e2e5e3 FIX: forgot the add the files cgemv_n_4.c and cgemv_t_4.c 2016-03-10 11:10:38 +01:00
wernsaar
551fdf53e8 Merge pull request #799 from wernsaar/develop
Added optimized cgemv_n and cgemv_t kernels for bulldozer, piledriver…
2016-03-10 10:22:08 +01:00
Werner Saar
fdf291be30 Added optimized cgemv_n and cgemv_t kernels for bulldozer, piledriver and steamroller 2016-03-10 09:42:07 +01:00
Zhang Xianyi
68eb4fa329 Add missing openblas_env makefile. 2016-03-09 14:52:47 -05:00
Zhang Xianyi
05196a8497 Refs #716. Only call getenv at init function. 2016-03-09 12:50:07 -05:00
wernsaar
db9b611b12 Merge pull request #798 from wernsaar/develop
Optimized zgemv_n kernel for bulldozer, piledriver and steamroller
2016-03-09 15:55:56 +01:00
Werner Saar
2e6333f74e modified common.h for piledriver 2016-03-09 15:48:29 +01:00
Werner Saar
c99cc41cbd Added optimized zgemv_n kernel for bulldozer, piledriver and steamroller 2016-03-09 14:02:03 +01:00
wernsaar
711ecb8bd5 Merge pull request #797 from wernsaar/develop
bugfixes for lapack and lapacke
2016-03-07 16:44:17 +01:00
Werner Saar
10c2ebdfc5 BUGFIX: removed fixes for bugs #148 and #149, because info for xerbla is wrong 2016-03-07 10:34:04 +01:00
Werner Saar
26b3b3a3e6 bugfixes form lapack svn for bugs #142 - #155 2016-03-07 10:10:00 +01:00
Werner Saar
acdff55a6a Bugfix for ztrmv 2016-03-07 09:39:34 +01:00
Zhang Xianyi
7d6b68eb4a Refs #786. Revert to default assembly kernel. 2016-03-07 11:34:58 +08:00
Werner Saar
0bbca5e803 removed build of smallscaling, because build on arm, arm64 and power fails 2016-03-06 11:54:41 +01:00
Werner Saar
cd5241d0cf modified KERNEL for power, to use the generic DSDOT-KERNEL 2016-03-06 09:07:24 +01:00
Werner Saar
8d652f11e7 updated smallscaling.c to build without C99 or C11
increased the threshold value of nep.in to 40
2016-03-06 08:40:51 +01:00
Zhang Xianyi
6c86570e1f Merge pull request #790 from jeromerobert/bug786
ztrmv_L.c: no longer need a 4kB buffer
2016-03-05 15:25:27 -05:00
Jerome Robert
53ba1a77c8 ztrmv_L.c: no longer need a 4kB buffer
Fix #786
2016-03-05 19:07:03 +01:00
Zhang Xianyi
d23c7c713c Fixed #789 Fix utest/ctest.h on Mingw. 2016-03-05 09:34:37 -05:00
Zhang Xianyi
8c43d7fa5f Merge remote-tracking branch 'origin/power8' into develop
Refs #774
2016-03-05 06:03:19 -05:00
Werner Saar
085f215257 Modified assembly label name, so that they are hidden.
Added license informations.
2016-03-05 10:27:27 +01:00
Zhang Xianyi
8f758eeff9 Refs #786. avoid old assembly c/zgemv kernels. 2016-03-05 08:32:03 +08:00
Werner Saar
0afc76fd65 enabled gemm_beta assembly kernels 2016-03-04 15:01:15 +01:00
Werner Saar
91e1c5080c modified configuration, to use power6 sgemm kernel for power8 2016-03-04 13:38:57 +01:00
Werner Saar
73f04c2c72 enabled hemv assemly function for power8 2016-03-04 13:20:50 +01:00
Werner Saar
3e633152c6 enabled symv assembly kernels on power8 2016-03-04 13:08:18 +01:00
Werner Saar
d5130ce7e3 enabled gemv assembly on power8 2016-03-04 12:53:31 +01:00
Werner Saar
4824b88fcb enabled all level1 assembly kernels for power8 2016-03-04 12:35:25 +01:00
Werner Saar
cc26d888b8 BUGFIX: increased BUFFER_SIZE for POWER8 2016-03-04 10:26:53 +01:00
Zhang Xianyi
8577be2a95 Modify travis script. 2016-03-04 04:24:43 +08:00
Zhang Xianyi
1edf30b790 Change Opteron(SSE3) to Opteron_SSE3 at dyanmaic core name. 2016-03-01 20:13:08 +08:00
Werner Saar
b752858d6c added dgemm-, dtrmm-, zgemm- and ztrmm-kernel for power8 2016-03-01 07:33:56 +01:00
Zhang Xianyi
4fc8c937d4 Refs #695 add testcase. 2016-03-01 01:05:56 -05:00
Zhang Xianyi
efa4f5c936 Refs #695 #783. Replace default x86_64 cgemv_t
asm kernel by C kernel.
2016-03-01 11:18:56 +08:00
Zhang Xianyi
17d655fa64 Merge pull request #784 from peterph/develop
collected usage notes
2016-02-27 11:24:20 -05:00
Petr Cerny
f68141cf1d collected usage notes 2016-02-27 16:57:22 +01:00
Zhang Xianyi
aa90518201 Update Changelog for 0.2.16.rc1. 2016-02-24 15:21:22 -05:00
Zhang Xianyi
6b85dbb6dc Refs #696. Turn off stack limit setting on Linux.
I cannot reproduce SEGFAULT of lapack-test with default stack size
on ARM Linux.
2016-02-24 14:21:42 -05:00
Zhang Xianyi
a0debd4293 Refs #696. Turn off stack limit setting on Linux.
I cannot reproduce SEGFAULT of lapack-test with default stack size
on ARM Linux.
2016-02-24 14:18:39 -05:00
Zhang Xianyi
937493bfeb Release 0.2.16 rc1 2016-02-23 18:29:21 -05:00
Zhang Xianyi
74b0672223 Fix c/zaxpyc kernel bug on Cortex-A57. 2016-02-23 22:47:53 +00:00
Zhang Xianyi
6e7be06e07 Refs JuliaLang/julia#5728. Fix gemv performance bug on Haswell Mac OSX.
On Mac OS X, it should use .align 4 (equal to .align 16 on Linux).
I didn't get the performance benefit from .align. Thus, I deleted it.
2016-02-19 17:56:07 -05:00
Zhang Xianyi
a04d0555ba [av skip] Fix utest makefile bug on travis ci. 2016-02-20 00:21:43 +08:00
Zhang Xianyi
3761c30ba4 Fix makefile bug for utest. 2016-02-18 17:01:48 -05:00
Zhang Xianyi
38593cd3a3 Fix compiling bug on ARM Cortex-A57. 2016-02-13 15:38:52 +00:00
Zhang Xianyi
e3b7781c2b Update readme. 2016-02-13 00:33:53 +08:00
Zhang Xianyi
5e6965ea47 Run utest when building. 2016-02-13 00:33:31 +08:00
Zhang Xianyi
5cc0301fc3 Enable utest for appveyor. 2016-02-12 01:50:20 -05:00
Zhang Xianyi
19a6dedfd6 Add utest for CMake. 2016-02-12 05:38:13 +08:00
Zhang Xianyi
0e2b92e216 Added mising lapacke files for CMake. 2016-02-12 05:28:16 +08:00
Zhang Xianyi
d06b92906a Add gemm3m building for CMake. 2016-02-12 05:02:51 +08:00
Zhang Xianyi
8e98478ff3 Update ctest.h from github.com:xianyi/ctest.git. 2016-02-12 05:01:57 +08:00
Zhang Xianyi
fb8968fb83 Refs #707. Bugfix for previous commit. 2016-02-11 05:14:53 +08:00
Zhang Xianyi
dae6b82a71 Refs #707. Add BUILD_LAPACK_DEPRECATED flag in Makefile.rule.
If you want to build LAPACK deprecated functions since LAPACK 3.6.0

make BUILD_LAPACK_DEPRECATED=1
2016-02-11 04:22:53 +08:00
Zhang Xianyi
d73244b825 Refs #727. Align stack buffer address on 32-bytes. 2016-02-11 03:52:02 +08:00
Zhang Xianyi
233c6b959f Merge pull request #780 from jeromerobert/bug727
Bug727
2016-02-08 13:24:40 -05:00
Jerome Robert
16ec5323c9 Fix zgemv.c compilation when stack allocation is disabled 2016-02-08 12:05:02 +01:00
Jerome Robert
0ad02ef2d6 update CONTRIBUTORS.md 2016-02-08 11:26:51 +01:00
Jerome Robert
73397faf68 Add benchmark/smallscaling.c
* Bench small matrices with multi-threading
* Close #727
2016-02-08 11:25:27 +01:00
Jerome Robert
5fc2203d8a zgemv: Add a workaround for #746 2016-02-08 11:25:15 +01:00
Jerome Robert
78dcf5c3d5 Improve performances of ztrmv on small matrices
* Use stack allocation
* Disable multi-threading
* Ref #727
2016-02-08 11:25:02 +01:00
Jerome Robert
32f793195f Use stack allocation in zgemv and zger
For better performance with small matrices
Ref #727
2016-02-08 11:24:21 +01:00
Zhang Xianyi
be4e5fcd20 Fixed #778. Merge branch 'buffer51-develop' into develop 2016-02-05 08:39:08 +08:00
buffer51
855e0cb700 Restored LAPACK_COMPLEX_STRUCTURE for Android prior to 21. Refs #682. 2016-02-04 17:20:07 -05:00
buffer51
7f7d04dcd2 Fixed linking error when compiling ARMv7 for Android (disabled -lpthread and added -Wl,--no-warn-mismatch). 2016-02-04 17:05:31 -05:00
buffer51
4e1b521e27 Fix lapack complex implementation of lauu2 and potf2 for Android (use FLOAT instead of FLOAT[2] as imaginary part is not used). 2016-02-04 16:59:56 -05:00
Zhang Xianyi
a1a96589aa Fixed #773 blas_quickdivide bug on CMake and Visual Studio x86 32-bit. 2016-02-04 15:23:32 -05:00
Zhang Xianyi
0e68beb89f Fixed #711, #698. Merge branch 'byzhang-develop' into develop 2016-02-03 02:56:27 +08:00
Zhang Xianyi
926ba8b7ca Merge branch 'develop' of https://github.com/byzhang/OpenBLAS into byzhang-develop 2016-02-03 02:48:32 +08:00
Zhang Xianyi
9f080c47e1 Merge pull request #743 from tkelman/patch-1
re enable Fortran optimization flag on windows
2016-02-02 13:46:12 -05:00
Zhang Xianyi
52eba814ce Fixed #769. Merge branch 'martin-frbg-develop' into develop 2016-02-02 13:43:51 -05:00
Martin Kroeker
935356c34f Update dynamic.c and cpuid_x86.c for Intel Avoton.
Second part of "support Intel Avoton via Nehalem kernel"
2016-02-02 13:42:55 -05:00
Zhang Xianyi
ff9388d625 Refs #768. Swap the result of zdot x87 fp kernel. 2016-02-02 13:38:01 -05:00
Martin Kroeker
4f05c23673 Update cpuid_x86.c
Add recognition of Intel Atom C27xx (Avoton, model code 4D)
2016-02-02 13:38:01 -05:00
Benyu Zhang
4a1263f609 Fix the source paths 2016-02-01 18:32:42 -08:00
Zhang Xianyi
962376664d Refs #768. Swap the result of zdot x87 fp kernel. 2016-02-02 09:15:02 +08:00
Tony Kelman
5fef0d1b75 re enable Fortran optimization flag on windows
partial revert of 299cdcdc29
from #696, was not explained why that was needed
2016-01-30 01:13:51 -08:00
Zhang Xianyi
578f471808 Fix utest bug when INTERFACE64=1. 2016-01-28 22:18:38 -06:00
Zhang Xianyi
5a8447e97e Use ctest.h for unit test. Enable unit test on travis CI. 2016-01-29 11:35:31 +08:00
Zhang Xianyi
be95bdaf47 Detect ARMV8 on 32-bit mode by using ARMV7 kernels. 2016-01-28 17:30:26 +00:00
Zhang Xianyi
c44ff4d648 Refs #714. avoid compiling warnings. 2016-01-28 04:38:07 +08:00
Zhang Xianyi
e003a1294c Merge pull request #764 from martin-frbg/develop
Update Makefile.system to fix awk/nawk issue #763
2016-01-26 14:03:27 -06:00
Martin Kroeker
44062517eb Update Makefile.system
Define AWK as "nawk" for SunOS (actually Illumos) only - fixes #763
2016-01-26 20:35:25 +01:00
Zhang Xianyi
13f0f8c10e Refs #723. Avoid out of boundary for getf2. 2016-01-26 09:14:57 -06:00
Zhang Xianyi
f5df444ceb Merge pull request #762 from jeromerobert/bug760
Let openblas_get_num_threads return the number of active threads
2016-01-26 08:45:16 -06:00
Zhang Xianyi
e382713423 Merge pull request #759 from jeromerobert/bug742
Bug742
2016-01-26 08:43:32 -06:00
Zhang Xianyi
aaa8551c57 Merge pull request #749 from lotheac/illumos_fixes
illumos fixes
2016-01-26 08:42:20 -06:00
Jerome Robert
0d87c1ffb6 Let openblas_get_num_threads return the number of active threads
... not the number of allocated threads.

Close #760
2016-01-26 13:04:16 +01:00
wernsaar
0b194426f8 Merge pull request #761 from wernsaar/develop
Ref #740: all assembly codes now clear floating point register correctly
2016-01-26 09:19:14 +01:00
Werner Saar
63a7d7fb24 updated gemv_n_vfpv3.S for armv7 2016-01-25 15:00:13 +01:00
Werner Saar
b4ede558a5 updated nrm2 kernel for armv7 2016-01-25 11:55:25 +01:00
Werner Saar
de3e2d4349 updated trmm kernels for armv7 2016-01-25 11:08:56 +01:00
Werner Saar
a0e51e96f1 updated gemm kernels for armv7 2016-01-25 10:46:10 +01:00
Lauri Tirkkonen
d6afac9624 don't pass -Y at all to the linker on illumos
the illumos linker can't understand the "-Y/lib"... form that f_check
generates, and -Wl cannot pass options that include commas
2016-01-25 11:09:34 +02:00
Werner Saar
c2891330bc updated KERNEL.ARMV6 2016-01-24 17:12:07 +01:00
Werner Saar
ceaa931e48 updated gemv kernel for armv6 2016-01-24 16:31:19 +01:00
Werner Saar
eaa63165df updated cgemv and zgemv kernels for armv6 2016-01-24 14:42:38 +01:00
Werner Saar
c65357c566 updated trmm_kernels for armv6 2016-01-24 13:03:33 +01:00
Werner Saar
e63e9f9f26 updated gemm_kernels for armv6 2016-01-24 11:55:50 +01:00
Jerome Robert
1fe3aab047 Use GEMM_MULTITHREAD_THRESHOLD as a number of ops
...not a matrix size. For GEMM_MULTITHREAD_THRESHOLD=4
(the default value) this does not change anything but
for other values it make the GEMM and GEMV thresholds
changing in the same way.

Close #742
2016-01-24 11:31:40 +01:00
Werner Saar
aafd3ab60e updated cdot and zdot on arm 2016-01-24 10:56:49 +01:00
Jerome Robert
1a1935507b [z]ger: increase multithread threshold
The ones given in 3ae30cd was by far to low because I
mixed m and m*n in my measures. Note that the new ones
are closed to the [z]gemv ones which is comforting
that both are right.
2016-01-24 10:46:35 +01:00
Werner Saar
d2f84c9c8a Ref #740: updated nrm2_vfp.S 2016-01-23 17:47:58 +01:00
Werner Saar
ca32253f32 Ref #740: updated asum_vfp.S and iamax_vfp.S 2016-01-23 14:44:34 +01:00
Werner Saar
9066d1f982 Ref #750 and Ref #740 : bugfix for sdot, dsdot and ddot on arm 2016-01-23 11:59:51 +01:00
Lauri Tirkkonen
8d85be770d actually install the shared lib on illumos 2016-01-22 18:56:03 +02:00
Lauri Tirkkonen
7ba1d9b9ca actually build the shared lib on illumos 2016-01-22 18:55:59 +02:00
Lauri Tirkkonen
31aff441ce use $(AWK) in Makefile.install and switch it to nawk 2016-01-22 18:55:55 +02:00
Lauri Tirkkonen
e737e32fd1 RLIMIT_NPROC doesn't exist on illumos 2016-01-22 18:55:51 +02:00
Lauri Tirkkonen
8635d425c1 make parallel make work on illumos 2016-01-22 18:55:48 +02:00
Lauri Tirkkonen
97cd4b8aee illumos fixes to memory.c 2016-01-22 18:55:43 +02:00
wernsaar
72390e3ffb Merge pull request #747 from wernsaar/develop
Ref #730: added performance updates for syrk and syr2k
2016-01-21 14:21:59 +01:00
Werner Saar
b07d733a71 added updates for syrk and syr2k 2016-01-21 13:16:44 +01:00
Zhang Xianyi
fa3018c30e Merge pull request #745 from jakirkham/minor_fix_scipy_prof
BENCH: Minor fixes in SciPy benchmarks
2016-01-20 11:24:22 -06:00
Zhang Xianyi
6caa40302e Merge pull request #744 from jeromerobert/bug731
Bug731
2016-01-20 11:18:21 -06:00
John Kirkham
a48b247e9e benchmark/scripts/SCIPY/dsyrk.py: Overwrite will work on a Fortran array of the correct type. 2016-01-19 15:32:28 -05:00
John Kirkham
b1b115ecd6 benchmark/scripts/SCIPY/ssyrk.py: Overwrite will work on a Fortran array of the correct type. 2016-01-19 15:31:37 -05:00
John Kirkham
07bba933ff benchmark/scripts/SCIPY/dsyrk.py: Arrays should be Fortran order. 2016-01-19 15:29:43 -05:00
John Kirkham
e85f8af519 benchmark/scripts/SCIPY/ssyrk.py: Arrays should be Fortran order. 2016-01-19 15:28:22 -05:00
John Kirkham
adfa0ab878 benchmark/scripts/SCIPY/ssyrk.py: Fix PEP8 issues. 2016-01-19 15:06:17 -05:00
John Kirkham
cbb6649e97 benchmark/scripts/SCIPY/dsyrk.py: Fix PEP8 issues. 2016-01-19 15:05:18 -05:00
John Kirkham
77abc9b280 benchmark/scripts/SCIPY/ssyrk.py: Write values into C. 2016-01-19 15:00:54 -05:00
John Kirkham
81e8690763 benchmark/scripts/SCIPY/dsyrk.py: Write values into C. 2016-01-19 15:00:23 -05:00
John Kirkham
dd04a8ac22 benchmark/scripts/SCIPY/ssyrk.py: Use the environment python. 2016-01-19 14:05:14 -05:00
John Kirkham
cb554b3a9c benchmark/scripts/SCIPY/dsyrk.py: Use the environment python. 2016-01-19 14:04:55 -05:00
John Kirkham
1153459d1b benchmark/scripts/SCIPY/ssyrk.py: Drop unneeded semicolons. 2016-01-19 14:00:51 -05:00
John Kirkham
1a73390ffe benchmark/scripts/SCIPY/dsyrk.py: Drop unneeded semicolons. 2016-01-19 14:00:51 -05:00
John Kirkham
8b981e41a1 benchmark/scripts/SCIPY/ssyrk.py: Allocate C using zeros instead of randomly generating it. 2016-01-19 14:00:48 -05:00
John Kirkham
c10b1f555d benchmark/scripts/SCIPY/dsyrk.py: Allocate C using zeros instead of randomly generating it. 2016-01-19 14:00:26 -05:00
Jerome Robert
14db1ca508 update CONTRIBUTORS.md 2016-01-19 17:15:31 +01:00
Jerome Robert
66eafb16cf swap: disable multi-threading for small matrices
Close #731
2016-01-19 17:14:46 +01:00
Jerome Robert
3ae30cd6b9 Disable multi-threading for small matrices in [z]ger
Ref #731
2016-01-19 17:14:31 +01:00
Werner Saar
692d9c881c Ref #740: simple solution to clear floating point register on arm 2016-01-17 15:37:12 +01:00
Zhang Xianyi
055b481386 Fixed CMake bug for single core. 2016-01-15 06:42:54 +08:00
Zhang Xianyi
ce2b1edd4e [av skip] Change test cmd on Travis. 2016-01-12 20:44:49 -06:00
Zhang Xianyi
8cf3657fb6 Refs #738. Fix previous commit bug. Run BLAS and CBLAS test on Travis. 2016-01-12 20:01:49 -06:00
Zhang Xianyi
44222a7fe0 Refs #738. Run test on Travis. 2016-01-12 22:52:47 +00:00
Zhang Xianyi
3ac153180c Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2016-01-12 22:25:36 +00:00
Zhang Xianyi
96b486acee Merge branch 'jeromerobert-bug736' into develop 2016-01-12 22:25:08 +00:00
Zhang Xianyi
3602a2cd1f #736 Revert #733 patch to fix bus error on ARM. 2016-01-12 22:19:58 +00:00
Zhang Xianyi
b65de4947a Merge pull request #739 from sebastien-villemot/develop
Fixes for old outstanding bugs in CBLAS test programs
2016-01-12 14:47:34 -06:00
Sébastien Villemot
04ad946fc8 Fix output descriptors of c_{s,d,c,z}blat3
The NTRA argument can be equal to -1 if one does not want a snapshot file
(and this is the case with sample data {s,d,c,z}in3).
The routines {S,D,C,Z}PRCN3 will try to use their first argument as an output
unit number, so we avoid calling them when NTRA < 0.

Patch originally written by Camm Maguire.
2016-01-11 11:22:17 +01:00
Sébastien Villemot
f704b8d32f Fix CBLAS double complex level 2 tests
The SNAME variable contains names of C functions like "cblas_dgemv".
Apparently the code was not taking into account the 6-letter "cblas_"
prefix when determining the task to be done.

The issue does not affect c_{s,d,c}blat2.f, which use the correct
offsetting.

Patch originally written by Camm Maguire.
2016-01-11 11:15:33 +01:00
Jerome Robert
708ad330ac stack alloc: Fix stack smashing detection in 32bits
* Fix commit 87a2ccc
* Close #736
2016-01-10 19:04:37 +01:00
Werner Saar
c6a27bbe64 added benchmark tests for ssyrk and dsyrk 2016-01-10 12:19:03 +01:00
Zhang Xianyi
f16b4f10b6 Merge pull request #734 from jeromerobert/common_stackalloc
Factorize MAX_STACK_ALLOC code to common_stackalloc.h
2016-01-08 22:13:37 -06:00
Jerome Robert
87a2ccc37c Factorize MAX_STACK_ALLOC code to common_stackalloc.h
Ref #727
2016-01-08 16:03:52 +01:00
Zhang Xianyi
e3e20e2242 Merge pull request #733 from yuyichao/arm-asm
Do not use vsub to clear the register values
2016-01-05 19:35:12 -06:00
Yichao Yu
594b9f4c73 Do not use vsub to clear the register values since it doesn't work with non-normal numbers. 2016-01-05 16:54:05 +00:00
wernsaar
c96c6a26fd Merge pull request #732 from wernsaar/develop
added optimized trsm_kernels
2016-01-05 15:34:08 +01:00
Werner Saar
c8f2c5d636 added optimized trsm_kernels 2016-01-05 13:05:05 +01:00
Werner Saar
5f2fa15e04 include sched.h if OS is Android 2016-01-05 12:36:49 +01:00
Zhang Xianyi
7d144aaabc Merge pull request #728 from jeromerobert/fix-no-stack-alloc
Fix make MAX_STACK_ALLOC=0
2016-01-04 15:04:24 -06:00
Jerome Robert
f9890a6452 Fix compilation when MAX_STACK_ALLOC is not set
Close #722
2015-12-31 14:43:09 +01:00
Jerome Robert
2c7143459f Let make MAX_STACK_ALLOC=0 do what expected
It's no longer required to modify Makefile.rule to disable
stack allocation. It's now possible to run:

make MAX_STACK_ALLOC=0
2015-12-31 14:43:09 +01:00
Zhang Xianyi
3857581adf Merge pull request #726 from jeromerobert/amd-e2-3200
Fix detection of AMD E2-3200
2015-12-28 12:53:11 -06:00
Zhang Xianyi
e9754e6250 Merge pull request #725 from jeromerobert/make-nb-jobs
Allow to force the number of parallel make job
2015-12-28 12:48:49 -06:00
Jerome Robert
76398c3233 Fix detection of AMD E2-3200 2015-12-28 19:45:47 +01:00
Jerome Robert
ba024fcfc0 Allow to force the number of parallel make job
This is particularly useful when using distcc
2015-12-28 19:45:29 +01:00
Zhang Xianyi
b9b52c295d Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2015-12-14 10:07:10 -06:00
Zhang Xianyi
285d042b10 Fixed rotg bug on ARM. 2015-12-14 10:07:01 -06:00
Zhang Xianyi
01db7908b8 Merge pull request #713 from btracey/patch-2
Fix Dormbr to perform the correct size operations with RowMajor
2015-12-10 10:13:49 -06:00
Zhang Xianyi
5f75df40d5 Merge pull request #711 from btracey/patch-1
Fix Dormlq to perform the correct size operations with RowMajor
2015-12-10 10:13:12 -06:00
Brendan Tracey
b3f100dc25 Fix Dormbr to perform the correct size operations with RowMajor
Fixes issue #712
2015-12-09 00:50:22 -07:00
Brendan Tracey
2f65aad626 Fix Dormlq to perform the correct size operations with RowMajor
Fixes issue #615.
2015-12-08 22:34:21 -07:00
Zhang Xianyi
25116788ef Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2015-12-05 00:46:42 +08:00
Zhang Xianyi
958f0de65e Refs #708. Modified config template for MSVC. 2015-12-05 00:45:29 +08:00
Zhang Xianyi
5d212f66a7 Refs #706. Fixed lapacke installation error. 2015-12-03 01:32:39 +08:00
Zhang Xianyi
f88ee18409 Merge pull request #704 from tkelman/patch-1
fix makefile warning when renaming symbols
2015-11-30 22:37:25 -06:00
Tony Kelman
d22917a58a fix makefile warning when renaming symbols
use different names for `openblas*.renamed` between osx and other unices, fixes
```
Makefile:121: warning: overriding commands for target `../libopenblas64_p-r0.2.15.a.renamed'
Makefile💯 warning: ignoring old commands for target `../libopenblas64_p-r0.2.15.a.renamed'
```

also clean `*.renamed`
2015-11-30 20:16:33 -08:00
Zhang Xianyi
640cccc2b1 Refs #697. Fixed gemv bug for Windows.
Thank matzeri's patch.
2015-11-30 15:19:45 -06:00
Zhang Xianyi
fba6532502 Refs #702. Delete redundant xerbla exporting 2015-11-30 11:08:33 -06:00
Zhang Xianyi
da7f69e8f4 Refs #699. Split the obj list of LAPACKE 3.6.0. 2015-11-24 13:15:28 -06:00
Zhang Xianyi
044fb91ea5 Merge pull request #690 from rayglover/msvc-fix
(Visual Studio) Don't use C99 complex numbers when building C++ code.
2015-11-23 11:05:37 -06:00
Zhang Xianyi
b4380acf77 Merge pull request #696 from ashwinyes/develop_20151120_lapack_test_fixes
Cortex A57 fixes and Lapack 3.6.0
2015-11-23 11:04:42 -06:00
Werner Saar
d1dd4e302e fix for bad or outdated mingw compiler 2015-11-23 16:20:14 +01:00
Ashwin Sekhar T K
318f0949c3 lapack-test fixes in nrm2 kernels for Cortex A57 2015-11-23 13:43:36 +05:30
Werner Saar
299cdcdc29 lapack fixes for Windos 2015-11-21 14:33:27 +01:00
Werner Saar
a8516c5b47 fixes for cross compile 2015-11-21 10:48:37 +01:00
Werner Saar
c40538eaeb bugfix for cross compiling 2015-11-20 13:47:22 +01:00
Werner Saar
33e37d01b3 added lapack-3.6.0 2015-11-20 09:45:46 +01:00
Werner Saar
64db4576e6 removed lapack-3.5.0 2015-11-20 09:41:59 +01:00
Werner Saar
0d22551a6b increase the stack size limit in the constructor 2015-11-20 09:23:01 +01:00
Ashwin Sekhar T K
1d121852c1 Fix blas_lock for arm64 2015-11-20 01:45:35 +05:30
Ashwin Sekhar T K
98965da2e8 lapack-test fixes for Cortex A57 2015-11-20 01:15:04 +05:30
Ashwin Sekhar T K
39937d15cd Change BUFFER_SIZE for Cortex A57 to 20 MB
Change the GEMM_P, GEMM_Q, GEMM_R values for Cortex A57
2015-11-20 01:12:04 +05:30
Ray Glover
a9d7eee0dc (Visual Studio) Don't use C99 complex numbers when building C++ code. 2015-11-17 17:29:30 +00:00
Zhang Xianyi
e31948ceb0 Fix #686. Merge branch 'ashwinyes-develop' into develop 2015-11-11 04:30:26 +08:00
Zhang Xianyi
233ec2a1cc Use 40 MB buffer for ARM Cortex A57. 2015-11-11 04:22:34 +08:00
Zhang Xianyi
a4c6a88a65 Delete vi swap file. 2015-11-11 04:19:43 +08:00
Zhang Xianyi
faf0811483 Merge branch 'develop' of https://github.com/ashwinyes/OpenBLAS into ashwinyes-develop 2015-11-11 04:16:22 +08:00
Zhang Xianyi
4e4a3e783f Update develop version. 2015-11-11 04:14:58 +08:00
Zhang Xianyi
d00ada378f Merge pull request #684 from sebastien-villemot/develop
Fix detection of POWER architecture in c_check.
2015-11-09 11:39:21 -06:00
Sébastien Villemot
41407acc19 Fix detection of POWER architecture in c_check.
This is necessary to avoid the false detection of a cross-compiling
environment.
2015-11-09 18:36:04 +01:00
Ashwin Sekhar T K
67874468a6 Fix bug in benchmark/gemm.c 2015-11-09 14:15:54 +05:30
Ashwin Sekhar T K
c99c43d51e Optimized trmm kernels for CORTEXA57 2015-11-09 14:15:54 +05:30
Ashwin Sekhar T K
1397b47197 Optimized zgemm kernel for CORTEXA57 2015-11-09 14:15:53 +05:30
Ashwin Sekhar T K
45f78963ac Optimized cgemm kernel for CORTEXA57
Also, add a generic ztrmm 4x4 kernel
2015-11-09 14:15:53 +05:30
Ashwin Sekhar T K
402443bf9c Optimized dgemm kernel for CORTEXA57 2015-11-09 14:15:53 +05:30
Ashwin Sekhar T K
19fdbee291 Improve the sgemm kernel for CORTEXA57 2015-11-09 14:15:53 +05:30
Ashwin Sekhar T K
3b0cdfab1e Optimized gemv kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:52 +05:30
Ashwin Sekhar T K
46efa6a1da Optimized swap kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:52 +05:30
Ashwin Sekhar T K
ea1465cdf8 Optimized scal kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:52 +05:30
Ashwin Sekhar T K
fb4be3b3eb Optimized rot kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:52 +05:30
Ashwin Sekhar T K
6c2f4ddbcd Optimized nrm2 kernels for CORTEXA57 2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
870c4d49c0 Optimized dot kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
cd7684097c Optimized copy kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
2690b71b1f Optimized axpy kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
3e4acedf0e Optimized asum kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:51 +05:30
Ashwin Sekhar T K
2610752dbb Optimized iamax kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:50 +05:30
Ashwin Sekhar T K
dbb213655e Optimized amax kernels for CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:50 +05:30
Ashwin Sekhar T K
9742dba595 Fix compiler errors in common.h 2015-11-09 14:15:50 +05:30
Ashwin Sekhar T K
f2f8a0fe8b Adding arm64 target CORTEXA57
Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
2015-11-09 14:15:50 +05:30
Ralph Campbell
55a0b27c01 Minor C code fixes in interface/ 2015-11-09 14:15:49 +05:30
Ralph Campbell
fbc21266e6 Minor C code fixes in driver/ 2015-11-09 14:15:49 +05:30
Ralph Campbell
c053559ed9 Minor C code fixes in kernel/arm 2015-11-09 14:15:49 +05:30
Ralph Campbell
55e4332f00 Remove duplicate -D args in kernel/Makefile.L1 2015-11-09 14:15:48 +05:30
Zhang Xianyi
a550431ee6 Refs #682. Enable LAPACK_COMPLEX_STRUCTURE when __ANDROID_API_ < 21. 2015-11-06 23:46:20 -06:00
Zhang Xianyi
3e8d6ea74f Init POWER8 kernels by POWER6. 2015-11-03 12:34:23 +08:00
Zhang Xianyi
839395fc25 Detect AMD Trinity and Richland. 2015-10-29 02:53:29 +08:00
Zhang Xianyi
1331642f24 Merge pull request #677 from j-bo/develop
Refs #676.  Fixed ONLY_CBLAS=1 compiling bug on windows.
2015-10-28 09:44:25 -05:00
j-bo
1e0bbea868 Refs #676. Fixed ONLY_CBLAS=1 compiling bug on windows. 2015-10-28 15:10:42 +01:00
Zhang Xianyi
53e849f4fc Merge branch 'develop' 2015-10-27 15:44:50 -05:00
Zhang Xianyi
8447498b50 Update doc for OpenBLAS 0.2.15 version. [CI skipped] 2015-10-27 15:44:35 -05:00
Zhang Xianyi
b6519159f5 Merge pull request #674 from j-bo/develop
Fix #673
2015-10-27 10:49:22 -05:00
Zhang Xianyi
63c56d3da9 Only include complex.h since Android 5.0 2015-10-27 10:47:55 -05:00
j-bo
718d0f18e3 Merge pull request #1 from j-bo/j-bo-patch-673
Fix #673
2015-10-27 13:56:45 +01:00
j-bo
6040858b22 Fix #673
Add lacking headers declarations when compiling for Android ARM7
2015-10-27 13:55:24 +01:00
Zhang Xianyi
70642fe4ed Refs #668. Raise the signal when pthread_create fails.
Thank James K. Lowden for the patch.
2015-10-26 19:02:51 -05:00
Zhang Xianyi
79d4a62e10 Add AppVeyor badge. 2015-10-26 18:14:41 -05:00
Zhang Xianyi
1ac8c32f1d [ci skip] Build Visual Studio 12 Win64 on Appveyor 2015-10-26 18:08:54 -05:00
Zhang Xianyi
0b2ad98e48 Only test x64 Windows CI. 2015-10-27 05:11:07 +08:00
Zhang Xianyi
69363622a8 Fix DYNAMIC_ARCH=1 bug. 2015-10-27 05:10:40 +08:00
Zhang Xianyi
e6d754fddc Use AppVeyor Windows CI. 2015-10-26 15:08:17 -05:00
Zhang Xianyi
f74ff6da38 Merge branch 'cmake' into develop 2015-10-26 14:54:59 -05:00
Zhang Xianyi
2feef49fa8 Merge branch 'develop' into cmake
Conflicts:
	driver/others/memory.c
2015-10-26 14:54:34 -05:00
Zhang Xianyi
53b6023a6c Fix cmake bug on MSVC 32-bit. 2015-10-26 14:52:13 -05:00
Zhang Xianyi
309875de3c Fix cmake bug on x86 32-bit.
e.g. Build 32-bit on 64-bit Linux.
cmake -DBINARY=32
2015-10-27 02:54:53 +08:00
Zhang Xianyi
b809f99cee Add CBLAS test for CMAKE. 2015-10-26 23:42:21 +08:00
Zhang Xianyi
5a291606ad Refs #671. the return of i?max cannot larger than N. 2015-10-24 01:16:34 +08:00
Zhang Xianyi
1ce054fcb3 Refs #669. Fixed the build bug with gcc on Mac OS X. 2015-10-22 11:07:35 -05:00
Zhang Xianyi
8fade093aa Fixed cmake bug on Visual Studio. 2015-10-20 14:37:22 -05:00
Zhang Xianyi
96f0bbe067 Fixed cmake bug on haswell. 2015-10-21 02:24:54 +08:00
Zhang Xianyi
d8392c1245 Fixe cmake config bugs. 2015-10-20 04:30:55 +08:00
Zhang Xianyi
aca7d7e953 Detect cmake test result. 2015-10-20 03:35:25 +08:00
Zhang Xianyi
94b125255f Merge branch 'develop' into cmake
Conflicts:
	driver/others/memory.c
2015-10-13 04:46:08 +08:00
Zhang Xianyi
3684706a12 Include time.h. 2015-10-08 15:18:54 +00:00
Zhang Xianyi
90aa8e24b9 Refs #615. Import bug fixes for LAPACKE dormlq. 2015-10-07 02:31:51 +08:00
Zhang Xianyi
11ac4665c8 Fixed #654. Make sure the gotoblas_init function is run before all other static initializations. 2015-10-05 14:14:32 -05:00
Zhang Xianyi
c666158b79 Merge pull request #656 from stevengj/libname
default to lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)
2015-10-05 10:25:15 -05:00
Zhang Xianyi
ccf581f94d Merge pull request #659 from Keno/patch-2
Fix cross compilation suffix detection
2015-10-05 10:23:52 -05:00
Keno Fischer
e9493f69eb Fix cross compilation suffix detection
If the path involves `-`, this would have otherwise detected this as a cross compile suffix.
2015-10-05 00:58:07 -04:00
Steven G. Johnson
88bef3bffc default to lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX), as discussed in #646: if you rename the symbols, it is best to rename the library 2015-10-01 15:07:04 -04:00
Zhang Xianyi
f27942a68a Fixed make TARGET=CORTEXA9 and CORTEXA15 bug. 2015-09-26 14:42:44 +00:00
Zhang Xianyi
0cc2b3de0b Merge pull request #652 from larsmans/fixes
Tiny fixes
2015-09-22 10:01:59 -05:00
Lars Buitinck
b9534bbd76 git ignore versioned .so files 2015-09-22 12:01:09 +02:00
Lars Buitinck
45c8b5e756 actually remove cblas_noconst.h
This file hasn't been used since 212463dce9.
2015-09-22 12:00:30 +02:00
Zhang Xianyi
a96a4cb012 Merge pull request #640 from kortschak/dlansy-fix
Fix LAPACK_*lansy routines
2015-09-10 10:36:57 -05:00
Zhang Xianyi
baec8f5cac Refs #638. Fixed compiling bug with clang on Mac OS X. 2015-09-10 10:32:07 -05:00
kortschak
d6e8459f20 Fix LAPACK_*lansy routines
Fixes #639.
2015-09-10 15:32:50 +09:30
Zhang Xianyi
dfe1eef33b Merge branch 'yuyichao-skylake-id' into develop 2015-09-09 10:48:15 -05:00
Zhang Xianyi
cc7cab8a45 Detect other Intel Skylake cores.
http://users.atw.hu/instlatx64/
2015-09-09 10:47:17 -05:00
Yichao Yu
61ae47eb99 Ref #632. Support Intel Skylake by Haswell kernels. 2015-09-09 11:07:33 -04:00
Zhang Xianyi
22353b1727 Merge pull request #634 from kortschak/lantr-trans-prep
Fix lantr preparation for row major matrices
2015-09-09 09:56:07 -05:00
kortschak
efffd28739 Fix lantr preparation for row major matrices 2015-09-09 09:25:48 +09:30
Zhang Xianyi
62cabef857 Merge pull request #633 from grisuthedragon/tune_imatcopy
Improved Ximatcopy when lda==ldb.
2015-09-08 13:59:08 -05:00
Martin Koehler
711ca33bc6 Improved Ximatcopy when lda==ldb.
The Ximatcopy functions create a copy of the input matrix
although they seem to work inplace. The new routines
XIMATCOPY_K_YY perform the operations inplace if the leading
dimension does not change.
2015-09-07 14:36:16 +02:00
Zhang Xianyi
40a3fed6b8 Merge pull request #630 from buffer51/develop
Fixed error in common.h for Android compilation introduced by e12cf11
2015-09-04 13:01:01 -05:00
buffer51
2297a2d989 Fixed error in common.h for Android compilation introduced by e12cf1123e 2015-09-03 20:54:21 -04:00
Zhang Xianyi
5408074941 Add notification. 2015-08-19 22:50:25 -05:00
Zhang Xianyi
bbcdf63bb4 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2015-08-19 22:48:55 -05:00
Zhang Xianyi
43eabab62f Merge pull request #619 from gitter-badger/gitter-badge
Add a Gitter chat badge to README.md
2015-08-19 22:26:20 -05:00
The Gitter Badger
50901943fd Added Gitter badge 2015-08-20 03:21:09 +00:00
Zhang Xianyi
7df0820160 Use C kernels for s/dgemv on x86. 2015-08-19 08:07:47 -05:00
Zhang Xianyi
17ee2237c3 Fixed cmake bug with NO_LAPACK=1 2015-08-18 22:43:42 -05:00
Zhang Xianyi
4b7381b7a4 Merge pull request #617 from notaz/arm_fixes
really fix ARM64 locking
2015-08-17 15:22:37 -05:00
Grazvydas Ignotas
abade3f896 really fix ARM64 locking 2015-08-17 01:27:45 +02:00
Zhang Xianyi
d1349e7a11 Merge pull request #616 from notaz/arm_fixes
ARM fixes
2015-08-16 17:16:18 -05:00
Grazvydas Ignotas
3efeaed0d8 correct a minor mistake 2015-08-16 20:12:04 +02:00
Grazvydas Ignotas
d38a1ddc7a use real armv5 support
there is no more requirement for ARMv6 instructions,
and VFP on ARMv5 is uncommon
2015-08-16 18:59:18 +02:00
Grazvydas Ignotas
6b92204a7c add fallback blas_lock implementation
to be used on armv5 and new platforms
2015-08-16 18:59:17 +02:00
Grazvydas Ignotas
f2ac1a5cee set ARMV7 for Cortex-A9 and Cortex-A15
otherwise some macros like YIELDING are not defined correctly
2015-08-16 18:59:17 +02:00
Grazvydas Ignotas
e12cf1123e add fallback rpcc implementation
- use on arm, arm64 and any new platform
- use faster integer math instead of double
- use similar scale as rdtsc so that timeouts work
2015-08-16 18:59:16 +02:00
Grazvydas Ignotas
d3e2f0a1af add missing barriers
should fix issue #597
2015-08-16 15:37:02 +02:00
Grazvydas Ignotas
c2323dd4d2 really fix ARM locking
- was writing 0 to lock variable, so was ineffective
- only exit loop if both lock was 0 and strex was successful
2015-08-16 15:18:42 +02:00
Zhang Xianyi
f8eba3d548 Fixed cmake build bugs on Linux. 2015-08-11 16:25:16 -05:00
Zhang Xianyi
40ab5cfc50 Merge branch 'hpanderson_cmake' into cmake 2015-08-11 03:31:55 +08:00
Zhang Xianyi
b7a8f9ad47 Merge branch 'cmake' of https://github.com/hpanderson/OpenBLAS into hpanderson_cmake 2015-08-11 03:31:07 +08:00
Zhang Xianyi
f874465bb8 Use cmake to build OpenBLAS GENERIC Target on MSVC x86 64-bit.
Disable CBLAS and LAPACK.
2015-08-10 14:10:44 -05:00
Zhang Xianyi
bb6e050509 Merge pull request #614 from xantares/cmake_version
install OpenBLASConfigVersion.cmake
2015-08-06 13:15:51 -05:00
xantares
87336b9acf install OpenBLASConfigVersion.cmake 2015-08-06 20:03:50 +02:00
Hank Anderson
19664f3ef4 Added missing lapacke.cmake file. 2015-08-06 07:40:06 -05:00
Zhang Xianyi
c50661e5b7 Merge pull request #613 from fabioperez/develop
Add POWER7/POWER8 as targets
2015-08-05 09:19:17 -05:00
Fábio Perez
b8d64a856a Add POWER7/POWER8 as targets 2015-08-05 11:02:39 -03:00
Zhang Xianyi
898fc7552a Merge pull request #612 from ibmsoe/ppc64le
ppc64le platform support (ELF ABI v2)
2015-08-04 16:58:24 -05:00
Zhang Xianyi
ab0a0a75fc Merge branch 'develop' into cmake 2015-08-03 23:59:01 -05:00
Zhang Xianyi
1cf2b10224 Use pure C generic target on x86 and x86_64.
make TARGET=GENERIC

?gemm3m is unimplemented on generic target.
2015-08-03 23:55:56 -05:00
Zhang Xianyi
7ac7e147d4 Fixed cmake building bugs on Linux. Disable LAPACK by default. 2015-08-04 04:37:05 +08:00
Matthew Brandyberry
7ba4fe5afb ppc64le platform support (ELF ABI v2) 2015-07-21 22:20:19 -05:00
Zhang Xianyi
a55377e9a4 Merge branch 'hpanderson_cmake' into cmake 2015-07-22 04:07:27 +08:00
Zhang Xianyi
dcd5ba4443 Merge branch 'cmake' of https://github.com/hpanderson/OpenBLAS into hpanderson_cmake 2015-07-22 04:06:39 +08:00
Zhang Xianyi
3f1b57668e Fix blas lock bug on AArch64. 2015-06-26 11:54:41 +08:00
Zhang Xianyi
d8f18d32c3 Merge pull request #595 from tanderson92/fixTests
Fix test execution when USE_OPENMP=0
2015-06-22 21:54:51 -05:00
wernsaar
bdb5c842fc Merge pull request #596 from wernsaar/develop
optimizations for haswell
2015-06-13 16:44:48 +02:00
Werner Saar
e7c969e164 added optimized dtrmm_kernel for haswell 2015-06-13 16:16:29 +02:00
Werner Saar
9bd962f655 modified haswell parameter dgemm_unroll_n 2015-06-13 10:28:27 +02:00
Thomas Anderson
4f5691e5c0 Fix test execution when USE_OPENMP=0
The standard way to disable OpenMP support is to set USE_OPENMP=0,
as indicated by other checks to see if USE_OPENMP equals 1. The
problem is obviously then that `ifdef USE_OPENMP` is very much not
what we want to test for. This causes tests to fail when no OpenMP
library is installed.
2015-06-12 23:52:07 -07:00
Zhang Xianyi
29293160a4 Fix #593. Change MACOSX_DEPLOYMENT_TARGET to 10.6. 2015-06-08 10:53:50 -05:00
wernsaar
3e33afef2e Merge pull request #592 from wernsaar/develop
added benchmark scripts
2015-06-08 14:22:02 +02:00
Werner Saar
8614057ea9 added benchmark scripts for numpy, octave and R 2015-06-08 14:06:38 +02:00
Werner Saar
7f375f9e8f updated geev benchmark 2015-06-08 12:58:38 +02:00
wernsaar
69c5169e7d Merge pull request #589 from wernsaar/develop
small modification of gemm.c
2015-06-03 12:14:09 +02:00
Werner Saar
e19948baa1 small modification of gemm.c 2015-06-03 09:11:51 +02:00
wernsaar
a2eaf234fc Merge pull request #587 from wernsaar/develop
added gesv benchmark
2015-06-02 15:29:49 +02:00
Werner Saar
6a13a94e71 added gesv benchmark 2015-06-02 13:35:49 +02:00
wernsaar
eff43d3289 Merge pull request #585 from wernsaar/develop
bugfix for benchmark Makefile on MAC
2015-05-31 15:01:54 +02:00
Werner Saar
9c4817d07b bugfix for Makefile on mac 2015-05-31 14:16:51 +02:00
wernsaar
319f3a0451 Merge pull request #584 from wernsaar/develop
bugfixes, to build benchmarks with mingw on Windows OS
2015-05-29 13:27:20 +02:00
Werner Saar
02c7766f68 bugfixes, to build benchmarks with mingw on Windows OS 2015-05-29 12:56:22 +02:00
wernsaar
f38cb67ca8 Merge pull request #581 from wernsaar/develop
bugfix for arm locking
2015-05-23 12:58:15 +02:00
Werner Saar
eea2e30b74 bugfix for arm locking 2015-05-23 11:40:40 +02:00
Werner Saar
19b8fd2aed smp lock bugfix 2015-05-23 10:58:38 +02:00
wernsaar
0cc5212741 Merge pull request #580 from wernsaar/develop
added blas level1 swap  benchmark
2015-05-23 09:46:39 +02:00
Werner Saar
c47c8e8cf5 added blas level1 swap benchmark 2015-05-21 08:51:42 +02:00
Zhang Xianyi
a11555c715 Support Android NDK armeabi-v7a-hard ABI. (-mfloat-abi=hard)
e.g.
make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7

In Android NDK, it uses armeabi-v7a-hard ABI.
TARGET_CFLAGS += -mhard-float -D_NDK_MATH_NO_SOFTFP=1
TARGET_LDFLAGS += -Wl,--no-warn-mismatch -lm_hard
For more information, please check hard-float example at
android_ndk/tests/device/hard-float/jni/.
2015-05-20 21:57:27 -05:00
wernsaar
897d03518e Merge pull request #578 from wernsaar/develop
added blas level1 copy benchmark
2015-05-20 11:56:02 +02:00
Werner Saar
23fbc5728e added blas level1 copy benchmark 2015-05-20 11:05:00 +02:00
Zhang Xianyi
6d40fa587f Fix f_check bug. 2015-05-19 12:04:45 -05:00
wernsaar
22dcd79959 Merge pull request #577 from wernsaar/develop
Bugfix for armv6 memory barrier
2015-05-19 10:59:24 +02:00
Werner Saar
ea4df0aad3 Ref #574: Bugfix for armv6 memory barrier 2015-05-19 10:43:12 +02:00
Zhang Xianyi
e127fb8fd8 1) Refs #575. Remove g77 from compiler list.
2) If OpenBLAS cannot find Fortran compiler, it will only build BLAS
(without LAPACK).
2015-05-19 00:01:04 -05:00
wernsaar
7fb718a7d8 Merge pull request #572 from wernsaar/develop
added optimized cscal and zscal functions for steamroller
2015-05-18 13:47:38 +02:00
Werner Saar
24f58c8bb1 added optimized cscal and zscal kernels for steamroller 2015-05-18 12:40:07 +02:00
Werner Saar
95b1faf667 added optimized cscal and zscal kernels for steamroller and piledriver 2015-05-18 10:50:57 +02:00
Werner Saar
2d9e406050 added optimized cscal kernel for sandybridge 2015-05-18 08:46:06 +02:00
Werner Saar
59083e3ce1 added optimized cscal kernel for bulldozer 2015-05-18 07:33:52 +02:00
wernsaar
685be40339 Merge pull request #571 from wernsaar/develop
added optimized cscal and zscal functions
2015-05-17 14:09:14 +02:00
Werner Saar
31c9e399e9 added optimized cscal kernel for haswell 2015-05-17 13:44:09 +02:00
Werner Saar
7de6bb9889 added optimized zscal kernel for bulldozer 2015-05-17 11:45:19 +02:00
Werner Saar
d63034303b added optimized zscal kernel for haswell 2015-05-16 16:41:45 +02:00
Zhang Xianyi
51ff17d46e Add AMD Excavator target. 2015-05-13 16:16:30 -05:00
wernsaar
905534942a Merge pull request #568 from wernsaar/develop
added optimized dscal kernel
2015-05-13 13:48:08 +02:00
Werner Saar
18e90ee2e3 bugfix: added static to functions 2015-05-13 13:31:26 +02:00
Werner Saar
e00cccc41e added optimized dscal kernel for piledriver 2015-05-13 13:05:35 +02:00
Werner Saar
73f09bf64f optimized dscal kernel for increment != 1 2015-05-13 12:14:39 +02:00
Werner Saar
02e772c7e4 added optimized dscal kernel for haswell 2015-05-12 17:19:58 +02:00
Werner Saar
7aee913991 added optimized dscal kernel for sandybridge 2015-05-12 16:27:43 +02:00
Werner Saar
e50a933037 added optimized dscal kernel for bulldozer 2015-05-12 12:28:44 +02:00
Zhang Xianyi
5f9011d6ef Merge pull request #566 from powderluv/develop
Fix build with ALLOC_SHM=0 (Android NDK)
2015-05-11 20:59:12 -05:00
powderluv
ebb9eba987 Fix build with ALLOC_SHM=0 (Android NDK)
Refactor such that you can build with ALLOC_SHM=0. HughTLB
implicity depends on ALLOC_SHM=1. This patch allows
building for Android NDK r10d.
2015-05-10 00:10:26 -07:00
Zhang Xianyi
8e5a1083bb Refs #532. Improve gemv paralel with small m and large n case.
Splite the matrix and reduction.
2015-05-08 05:33:17 +08:00
Zhang Xianyi
6743beb748 Refs #565. Fix the bug of generate FEXTRALIB. 2015-05-07 13:06:53 +08:00
Zhang Xianyi
bcabf72c08 Refs #565. Merge branch 'andreasnoack-anj/bench' into develop 2015-05-07 12:52:14 +08:00
Andreas Noack
cda29f183b Add vecLib benchmarks 2015-05-06 21:52:34 -04:00
wernsaar
e52d36450a Merge pull request #564 from wernsaar/develop
Use only 1 thread in trsm if m or n < 2*GEMM_MULTITHREAD_THRESHOLD
2015-05-06 11:10:31 +02:00
Werner Saar
f8f2e261fe use only 1 thread if m or n < 2*GEMM_MULTITHREAD_THRESHOLD 2015-05-06 10:41:53 +02:00
Werner Saar
be3c843700 added loops to trsm.c 2015-05-06 09:21:19 +02:00
wernsaar
e6f57db846 Merge pull request #563 from wernsaar/develop
Bugfix for gemm3m tests
2015-05-05 12:13:35 +02:00
Werner Saar
9bfd267d51 bugfix for gemm3m tests 2015-05-05 11:58:59 +02:00
Werner Saar
924bc5372e removed gemm3m functions from normal checks 2015-05-05 11:39:43 +02:00
wernsaar
2b83a69650 Merge pull request #561 from wernsaar/develop
updated dgemv_n sgemv_n kernels
2015-05-04 11:11:13 +02:00
Werner Saar
133c11a156 updated dgemv_n kernel for nehalem 2015-04-30 14:38:06 +02:00
Werner Saar
30f52d53df optimized dgemv_n kernel for haswell 2015-04-30 12:11:39 +02:00
Zhang Xianyi
a124637329 Merge pull request #560 from sebastien-villemot/develop
Fix detection of ARM architectures in c_check.
2015-04-29 11:36:47 -05:00
Sébastien Villemot
642aaba2e0 Fix detection of ARM architectures in c_check.
This is necessary to avoid the false detection of a cross-compiling environment.
2015-04-29 18:14:21 +02:00
wernsaar
4c616173e4 Merge pull request #558 from wernsaar/develop
optimizations for sandybridge
2015-04-28 17:30:16 +02:00
Werner Saar
5e83d80725 optimized dger kernel for sandybridge 2015-04-28 16:58:11 +02:00
Werner Saar
b2e1797dc6 added optimized sger kernel for sandybridge 2015-04-28 15:33:38 +02:00
Werner Saar
e216f686cb optimized saxpy and daxpy for sandybridge 2015-04-28 10:18:32 +02:00
Zhang Xianyi
e42652f772 Merge pull request #554 from wernsaar/develop
added benchmarks for zgeru and cgeru
2015-04-25 08:11:36 -05:00
Werner Saar
e77db2af31 add benchmarks for zgeru and cgeru 2015-04-25 14:53:07 +02:00
Zhang Xianyi
37b00841ac Merge pull request #552 from jeromerobert/develop
gemv: Ensure stack buffer is large enough to handle memory alignment
2015-04-24 14:12:12 -05:00
Werner Saar
fc0e0391f3 bugfixes: replaced int with BLASLONG 2015-04-24 14:30:44 +02:00
wernsaar
da0f27b9ac Merge pull request #553 from wernsaar/develop
optimized some blas level1 kernels for increments != 1
2015-04-24 13:57:48 +02:00
Werner Saar
c22068c406 optimized sdot.c for increments != 1 2015-04-24 13:13:20 +02:00
Werner Saar
dee100d0e4 optimized saxpy.c for increments != 1 2015-04-24 11:52:59 +02:00
Werner Saar
0273966abb optimized daxpy kernel for increments != 1 2015-04-24 11:39:17 +02:00
Werner Saar
3a67daa954 optimized ddot.c for increments != 1 2015-04-24 10:56:55 +02:00
Jerome Robert
ab567d8443 gemv: Ensure stack buffer is large enough to handle memory alignment
Ref #478
2015-04-24 10:12:49 +02:00
wernsaar
3c09cea4b2 Merge pull request #550 from wernsaar/develop
added optimized ssymv kernels for haswell and sandybridge
2015-04-23 13:27:38 +02:00
Werner Saar
b4f2153dcd added optimized ssymv kernels for sandybridge 2015-04-23 12:19:24 +02:00
Werner Saar
1c4b0eeae3 added optimized ssymv kernels for haswell 2015-04-23 10:23:13 +02:00
wernsaar
406d9d64e9 Merge pull request #549 from wernsaar/develop
added optimized dsymv kernels for haswell and sandybridge
2015-04-22 12:36:13 +02:00
Werner Saar
1bec9abb9a added optimized dsymv kernels for sandybridge 2015-04-22 12:09:43 +02:00
Werner Saar
3814bf60d3 added optimized dsymv kernels for haswell 2015-04-22 10:42:50 +02:00
Zhang Xianyi
847e19c04e Refs #478,#482, Enable stack alloc for s/dgemv_t.(revert 9798491) 2015-04-20 23:22:40 -05:00
Werner Saar
46c7b4d5c8 added asum benchmark 2015-04-19 11:24:07 +02:00
Werner Saar
8e05d291b5 added scal benchmark 2015-04-18 08:41:41 +02:00
wernsaar
9da555e5f7 Merge pull request #546 from wernsaar/develop
added optimized zaxpy-kernels
2015-04-16 11:36:51 +02:00
Werner Saar
6d0db0151f added optimized zaxpy-kernels 2015-04-16 11:19:37 +02:00
Zhang Xianyi
37b9033c90 Merge pull request #543 from jeromerobert/develop
Fix a buffer overflow with MAX_STACK_ALLOC size in dgemv_t
2015-04-15 11:18:14 -05:00
wernsaar
59e7a518c6 Merge pull request #544 from wernsaar/develop
Optimized  caxpy-kernels
2015-04-15 17:04:02 +02:00
Werner Saar
13889515b3 added optimized caxpy-kernel for sandybridge 2015-04-15 16:29:25 +02:00
Werner Saar
248c9340c3 added optimized caxpy-kernel for haswell 2015-04-15 15:16:31 +02:00
Werner Saar
e9f33b4ca7 added optimized caxpy-kernel for steamroller 2015-04-15 13:49:23 +02:00
Werner Saar
f5d847122a updated caxpy_microk_bulldozer-2.c and caxpy.c 2015-04-15 11:59:38 +02:00
Jerome Robert
a4c96eca67 Fix a buffer overflow with MAX_STACK_ALLOC size in dgemv_t
Refs #478, #482, 9798481, fd9fd42
2015-04-15 11:46:48 +02:00
wernsaar
fb02cb0a41 Merge pull request #540 from wernsaar/develop
Optimized dot- and axpy-kernels
2015-04-14 15:53:09 +02:00
Werner Saar
baa0363ea2 add optimized ddot-kernel for piledriver 2015-04-14 15:09:13 +02:00
Werner Saar
34ba66606a add optimized daxpy-kernel for piledriver 2015-04-14 14:23:29 +02:00
Werner Saar
f615dc7603 added optimized saxpy kernel for steamroller 2015-04-14 09:09:39 +02:00
Werner Saar
331c417637 optimized saxpy for piledriver 2015-04-14 08:34:11 +02:00
Zhang Xianyi
6c3a0b5d46 Enable MAX_STACK_ALLOC by default. 2015-04-13 23:23:40 -05:00
Zhang Xianyi
fd9fd42936 Refs #478, #482. Fixed bug on previous commit. 2015-04-13 23:22:27 -05:00
Zhang Xianyi
9798481979 Refs #478, #482. Fix segfault bug for gemv_t with MAX_ALLOC_STACK flag.
For gemv_t, directly use malloc to create the buffer.
2015-04-13 19:45:27 -05:00
Werner Saar
d7a17ad85d optimized sdot-kernel for pilediver 2015-04-13 13:19:21 +02:00
Werner Saar
d35f6c63c2 add optimized daxpy-kernel for steamroller 2015-04-13 12:22:43 +02:00
Werner Saar
166d76e864 added optimized sdot-kernel for steamroller 2015-04-11 08:48:18 +02:00
Werner Saar
f9f127d838 added optimized ddot kernel for steamroller 2015-04-10 16:18:03 +02:00
wernsaar
62231ab337 Merge pull request #538 from wernsaar/develop
Added optimized cdot- and zdot-kernels
2015-04-10 16:03:37 +02:00
Werner Saar
3119def9a7 updated cdot and zdot 2015-04-10 11:10:31 +02:00
Werner Saar
33b332372a add optimized cdot- and zdot-kernel for sandybridge 2015-04-10 09:37:26 +02:00
Werner Saar
fd838c75bc add optimized cdot- and zdot-kernel for haswell 2015-04-09 15:13:52 +02:00
Werner Saar
b57a60dac8 updated cdot and zdot for piledriver 2015-04-09 10:33:46 +02:00
Werner Saar
5c51163972 added optimized cdot- and zdot-kernel for steamroller 2015-04-09 09:45:23 +02:00
Werner Saar
9299d8cfd6 added optimized cdot- and zdot-kernels for bulldozer 2015-04-08 16:29:55 +02:00
Zhang Xianyi
0a3d3b945d Refs #535. Fix the wrong vector instruction in sgemm sandy bridge kernel. 2015-04-08 03:55:49 +08:00
Zhang Xianyi
4f680a7d61 Merge pull request #534 from wernsaar/develop
Refs #533. added optimized saxpy- and daxpy-kernel for haswell and sandybridge
2015-04-07 12:48:11 -05:00
Werner Saar
ba926e807c added cdot- and zdot benchmark 2015-04-07 11:56:06 +02:00
Werner Saar
60c6dec6e6 updated some lines for bulldozer 2015-04-06 18:47:16 +02:00
Werner Saar
47898cca35 added optimized saxpy- and daxpy-kernel for sandybridge 2015-04-06 16:05:16 +02:00
Werner Saar
53bb924287 added optimized saxpy- and daxpy-kernel for haswell 2015-04-06 12:33:16 +02:00
Zhang Xianyi
1e80b8b0d3 Merge pull request #531 from wernsaar/develop
added optimized sdot- and ddot-kernels for Haswell and Sandybridge
2015-04-05 16:42:39 -05:00
Werner Saar
a901b065d3 added optimized ddot-kernel for sandybridge 2015-04-05 20:19:38 +02:00
Werner Saar
3937e2a0a0 add optimized sdot-kernel for sandybridge 2015-04-05 19:47:05 +02:00
Werner Saar
9707d608d5 removed double definition line 2015-04-05 18:35:34 +02:00
Werner Saar
701b9d7556 added optimized sdot- and ddot-kernel for HASWELL 2015-04-05 17:57:53 +02:00
Zhang Xianyi
8977b3f235 Refs #529. Support Intel Broadwell by Haswell kernels. 2015-04-02 11:08:03 -05:00
Zhang Xianyi
f6426395ea Merge pull request #527 from xantares/patch-1
fix mingw install
2015-03-30 10:16:11 -05:00
xantares
0ac787eefe fix mingw install 2015-03-30 09:30:55 +02:00
Zhang Xianyi
e5b96e55a7 Fix build bug for ARM64. 2015-03-24 15:27:17 -05:00
Zhang Xianyi
d0c51c4de9 Merge branch 'develop' 2015-03-24 15:07:07 -05:00
Zhang Xianyi
a3491e1e88 Update the doc for 0.2.14. 2015-03-24 15:05:59 -05:00
Zhang Xianyi
e81a5d61e4 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2015-03-24 12:17:12 -05:00
Zhang Xianyi
c674fa32be Add ARM targets. 2015-03-24 12:17:04 -05:00
Zhang Xianyi
e34911a73d Fix compiling bug for ARM with setting BINARY. 2015-03-24 17:15:33 +00:00
Zhang Xianyi
76dcaf2281 Merge pull request #521 from maxlevesque/patch-1
Correct typo /proc/ instead of /pros/
2015-03-21 12:26:35 -05:00
Maximilien Levesque
770fac92eb Correct typo /proc/ instead of /pros/ 2015-03-20 23:25:11 +01:00
Zhang Xianyi
e95d64333a Refs #519. Avoid calling strncpy. 2015-03-19 15:57:22 -05:00
Zhang Xianyi
75c40bcc48 Refs #520. Fixed ONLY_CBLAS=1 compiling bug on OSX. 2015-03-19 11:52:09 -05:00
Zhang Xianyi
b62f9f4120 Merge pull request #518 from ton/issue-508
Fix issue #508
2015-03-18 13:00:07 -05:00
Ton van den Heuvel
b6438dedea Fix issue #508
Fix race condition during shutdown causing a crash in
gotoblas_set_affinity().
2015-03-18 13:22:43 +01:00
Hank Anderson
1d183dcda8 Added lapacke sources. 2015-02-25 16:51:08 -06:00
Zhang Xianyi
cdefdb21cd Refs #492. Fixed c/zsyr bug with negative incx. 2015-02-26 06:37:03 +08:00
Hank Anderson
e19bf3a28b Removed MSVC cpuid func when using clang. 2015-02-25 14:44:49 -06:00
Hank Anderson
3649cfbd7b Fixed EPILOGUE for clang. 2015-02-25 12:23:26 -06:00
Hank Anderson
5ae8993752 Added intrinsics for MSVC. 2015-02-25 11:52:51 -06:00
Hank Anderson
84d90d6ed8 Fixed some compiler errors/warnings for clang. 2015-02-25 11:52:25 -06:00
Hank Anderson
518e2424a8 Fixed bad filename for cpuid.S compile. 2015-02-25 11:51:29 -06:00
Zhang Xianyi
ea7f9dacf4 Refs #509. Fixed geadd building bug with DYNAMIC_ARCH=1. 2015-02-26 01:47:11 +08:00
Zhang Xianyi
bf5dbb7e2a Refs#509. Merge branch 'grisuthedragon-develop' into develop 2015-02-26 01:44:19 +08:00
Hank Anderson
00e373aea6 Added LAPACK sources directly to add_library call instead of OBJECT. 2015-02-25 10:18:18 -06:00
Hank Anderson
9eaea02f33 Added additional gemm defines for complex types. 2015-02-25 09:39:11 -06:00
Hank Anderson
ab7043373f Fixed bug generating trmv complex source names. 2015-02-24 15:18:41 -06:00
Hank Anderson
504cdb10ed Added check for MSVC before enabling fortran.
Currently forcing gfortran, instead of assuming ifort.
2015-02-24 14:31:45 -06:00
Hank Anderson
a8002b0c5f Separated getarch ASM file when using MSVC. 2015-02-24 14:31:18 -06:00
Hank Anderson
0553476fba Added TRANS defines for complex sources in lapack. 2015-02-24 14:30:35 -06:00
Hank Anderson
2416d9dbac Fixed TRANSA defines for complex sources in driver/level2. 2015-02-24 13:18:07 -06:00
Hank Anderson
0d8e227ea7 Changed strategy for setting preprocessor definitions.
Instead of generating separate object files for each permutation of
defines for a source file, GenerateNamedObjects now writes an entirely
new source file and inserts the defines as #define c statements.

This solves a problem I ran into with ar.exe where it was refusing to
link objects that had the same filename despite having different paths.
2015-02-24 12:26:33 -06:00
Hank Anderson
12d1fb2e40 Fixed incorrect object name in kernel CMakeLists.txt 2015-02-24 10:30:16 -06:00
Hank Anderson
1b7f427401 Added conj gemv objects for complex build. 2015-02-23 10:24:31 -06:00
Hank Anderson
b2284647a3 More complex objects. 2015-02-23 07:51:05 -06:00
Hank Anderson
a6116e5859 Added some more complex-only objects. 2015-02-22 17:49:28 -06:00
Hank Anderson
fb5d5bb971 Added defines for complex trmv. 2015-02-21 12:39:03 -06:00
Hank Anderson
371071d461 Added CONJ defines for trmm/trsm. 2015-02-21 10:59:02 -06:00
Hank Anderson
8a143516e3 Added alternate_name to a couple of the name mangling schemes.
Added zherk_k sources to driver/level3.
2015-02-20 17:03:33 -06:00
Hank Anderson
e5897ecb9b Added zherk_kernel.c objects to driver/level3. 2015-02-19 16:19:56 -06:00
Hank Anderson
714638c187 Added some TRMM objects for complex types. 2015-02-19 16:11:51 -06:00
Hank Anderson
e27c372e53 Fixed reuse of float_char from parent loop.
Fixed in/it/on/otcopy names.
2015-02-19 13:53:29 -06:00
Hank Anderson
f3f2b3d768 Added complex and single netlib-lapack fortran sources to lapack.cmake. 2015-02-19 12:26:11 -06:00
Hank Anderson
9492298048 Added other float types to Makefile.L3. 2015-02-18 13:01:05 -06:00
Hank Anderson
43725b82c5 ParseMakefileVars now replaces Makefile vars with CMake vars. 2015-02-18 12:23:17 -06:00
Hank Anderson
14fd3d35de Added checks for missing defines in kernel. 2015-02-18 10:25:01 -06:00
Hank Anderson
cebc07cebd ParseMakefileVars now recursively parses included makefiles. 2015-02-17 22:09:41 -06:00
Hank Anderson
33c5e8db7f Added a helper function for setting the L1 kernel defaults.
Added loop to build objects with different KERNEL defines.
2015-02-17 21:36:23 -06:00
Hank Anderson
67e39bd8fb Added mangled complex filenames to interface and lapack CMakeLists.txt. 2015-02-17 13:12:30 -06:00
Hank Anderson
9eb1499095 Added another param to GenerateNamedObjects to mangle complex source names.
There are a lot of sources for complex float types that are the same
names as the real sources, except with z prepended.
2015-02-17 10:30:28 -06:00
Martin Koehler
39cc6b21d3 Add ATLAS-style ?geadd function 2015-02-16 13:46:20 +01:00
Hank Anderson
4662a0b13a Changed generate functions to iterate through a list of float types.
This will generate obj files for SINGLE/DOUBLE/COMPLEX/DOUBLE COMPLEX.
2015-02-15 17:44:37 -06:00
Hank Anderson
e74462a3f5 Moved declarations to start of functions to satisfy MSVC C89 implementation. 2015-02-11 11:16:57 -06:00
Hank Anderson
056ba26755 Changed a number of inline calls to use __inline.
MSVC doesn't inmplement C99, so can't use the inline keyword. __inline
appears to work in MSVC and GCC.
2015-02-11 11:13:17 -06:00
Hank Anderson
a0d9a7fd83 Changed _Complex types in common_level1.h to use the typedef. 2015-02-11 11:12:14 -06:00
Hank Anderson
5d3fc092e9 Added MSVC defines to common.h.
Don't have unistd.h in MSVC.

Chagned YIELDING to use the YeildProcessor macro.
2015-02-11 11:10:45 -06:00
Hank Anderson
c94fe71278 Removed incoming-stack-boundary for MSVC.
Made float type optional for GenerateNamedObjects.

Called GenerateNamedObjects for a couple of driver/others files that
needed NAME/CNAME set.
2015-02-11 10:54:14 -06:00
Hank Anderson
d60b49e5c5 Turned off uninizialized variable warning when compiling lapack-netlib. 2015-02-10 14:36:43 -06:00
Hank Anderson
64b5a0ef84 Added AUX files from lapack-netlib. 2015-02-10 14:29:05 -06:00
Hank Anderson
162791e30e Added common objects from kernel Makefile. 2015-02-10 12:42:05 -06:00
Hank Anderson
8743093bd7 Added aux files from lapack-netlib. 2015-02-10 11:47:46 -06:00
Hank Anderson
96cf6779ca Added DLA sources from lapack-netlib.
Can't use the lapack-netlib cmake files, since they are designed to
build a complete lapack/blas library. They have their own fortran
detection and flag setup and so on. Instead I'll just recreate the
makefiles I need.

Fixed a typo in the NAME defines.
2015-02-10 11:01:01 -06:00
Hank Anderson
3b20b62423 Fixed trti2 name. 2015-02-09 15:29:28 -06:00
Hank Anderson
6ddbfea700 Added generic laswp object. 2015-02-09 15:15:58 -06:00
Hank Anderson
c0624a26be Fixed some dgemm_copy function names. 2015-02-09 14:34:29 -06:00
Hank Anderson
4bfaf1ce66 Removed some list appends I missed. 2015-02-09 12:56:55 -06:00
Hank Anderson
e8c39138c6 Removed return value from GenerateNamedObjects.
It sets DBLAS_OBJS directly to save a bunch of list appending in the
CMakeLists.txt files.
2015-02-09 12:28:09 -06:00
Hank Anderson
f992799226 Added the rest of Makefile.L3. 2015-02-09 10:47:35 -06:00
Hank Anderson
4c65afcce1 Changed kernel filenames to vars. These will need to be read from KERNEL.
Added some kernel/L3 objects.
2015-02-09 09:52:14 -06:00
Hank Anderson
7fa5c4e2fd Fixed some case issues with ARCH.
Added some kernel and driver/others objects.
2015-02-08 15:29:18 -06:00
Zhang Xianyi
771b18ae9c Detect the wrong combined flags of USE_OPENMP=1 and USE_THREAD=0. 2015-02-08 01:42:48 -06:00
Zhang Xianyi
cfa9392ffa Fix openblas_get_num_threads and openblas_get_num_procs bug with single thread. 2015-02-08 01:30:23 -06:00
Hank Anderson
fa0e6a6c93 Added the rest of the L1 kernel makefile. 2015-02-07 21:37:46 -06:00
Hank Anderson
2f59135eb6 Added gemv to level2 CMakeLists.txt. 2015-02-07 21:15:21 -06:00
Hank Anderson
38681fb1c6 Added more kernel files. 2015-02-07 12:54:30 -06:00
Hank Anderson
6b5d26e07b Added SMP sources to level2 CMakeLists.txt. 2015-02-06 16:52:19 -06:00
Hank Anderson
13d2d48e67 Added yet another naming scheme for lapack functions. 2015-02-06 13:42:20 -06:00
Hank Anderson
189fadfde0 Started implementing kernel/Makefile in cmake. 2015-02-05 21:05:11 -06:00
Hank Anderson
627d5e7401 Added SMP objects to driver/level3. 2015-02-05 12:22:48 -06:00
Hank Anderson
943fa2fb58 Fixed object names in level2. 2015-02-05 10:49:11 -06:00
Hank Anderson
1b62a4f3c9 Changed some function parameters to optional. 2015-02-05 09:39:40 -06:00
Hank Anderson
461e691127 Codes when define is absent are now a parameter to AllCombinations.
The level3 object names should now be correct.
2015-02-05 09:23:47 -06:00
Hank Anderson
cfaf1c678f Added option to append define codes with an underscore.
Fixed the code array not getting reset on subsequent AllCombinations
calls.
2015-02-05 09:17:18 -06:00
Hank Anderson
0d7bad1f35 Changed GenerateObjects to append combination codes (e.g. dtrmm_TU). 2015-02-05 09:02:54 -06:00
Hank Anderson
373a1bdadb Converted lapack/Makefile to cmake. 2015-02-04 15:47:10 -06:00
Hank Anderson
2828f6630c Added SMP sources to COMMONOBJS. 2015-02-04 14:01:36 -06:00
Hank Anderson
58cff2fed8 Added CBLAS define/naming convention to GenerateNamedObjects. 2015-02-04 11:30:15 -06:00
Hank Anderson
5690cf3f0e Added override for function names in GenerateNamedObjects.
The BLAS interface folder should now be generated the correct objects
for the DOUBLE case.
2015-02-04 10:52:19 -06:00
Hank Anderson
a0aeda6187 Added function to set defines for the object names (e.g. -DNAME=dgemm). 2015-02-04 10:37:34 -06:00
Zhang Xianyi
1ccd57ce80 Merge pull request #497 from eschnett/develop
Introduce openblas_get_num_threads and openblas_get_num_procs
2015-02-03 23:09:38 -06:00
Hank Anderson
84b3d760c4 Converted rest of Makefile.system to system.cmake. 2015-02-03 16:05:01 -06:00
Hank Anderson
0beea3a5a5 Converted LAPACK flags from Makefile.system. 2015-02-03 15:33:56 -06:00
Hank Anderson
560c96a9a7 Fixed newlines in some cmake files. 2015-02-03 15:11:15 -06:00
Hank Anderson
0ccfa60a53 Changed fortran compiler name to be uppercase and stripped of path/ext. 2015-02-03 15:09:37 -06:00
Hank Anderson
30be551502 Corrected fortran compiler name variables.
Fixed some typos.

Updated c_check to set ARCH and BINARY64/32.

Added version variables.
2015-02-03 14:21:22 -06:00
Hank Anderson
be1ce38f24 Fixed some missing parentheses. 2015-02-03 14:00:29 -06:00
Hank Anderson
e818ace11a Ported more of Makefile.system to CMake. 2015-02-03 13:34:41 -06:00
Hank Anderson
e4bfbd8258 Added fc.cmake (forgot it in last commit).
Moved a couple C compiler ifs from Makefile.system into cc.cmake.
2015-02-03 13:08:59 -06:00
Hank Anderson
2d5b442f5b Ported Fortran configuration code from Makefile.system to fc.cmake. 2015-02-03 12:32:23 -06:00
Hank Anderson
af11aff309 Ported C compiler settings from Makefile.system into new cmake file. 2015-02-03 12:00:49 -06:00
Hank Anderson
e66aa5f3b7 Ported arch dependent settings from Makefile.system to new cmake file. 2015-02-03 11:32:20 -06:00
Erik Schnetter
65a847cd36 Introduce openblas_get_num_threads and openblas_get_num_procs 2015-02-03 12:23:41 -05:00
Hank Anderson
31cf22cb4b Ported OS settings from Makefile.system into new cmake file. 2015-02-03 11:07:58 -06:00
Hank Anderson
20e593a44a Added cblas_ objects to interface CMakeLists.
Naming isn't right, though, not seeing cblas_xxxx exports in the
resulting library.
2015-02-02 16:25:30 -06:00
Hank Anderson
7194424fef Added missing common objects to the library. 2015-02-02 15:21:29 -06:00
Hank Anderson
d11bde60d0 DOUBLE define for DBLAS objects is now set in main CMakeLists.txt.
Since the objects are the same, could generate SINGLE/COMPLEX/etc here
without having to rewrite all the object enumeration code again.
2015-02-02 15:00:44 -06:00
Hank Anderson
9e154aba58 Added LAPACK object files to interface CMakeLists. 2015-02-02 12:31:15 -06:00
Hank Anderson
5057a4b4df Added openblas add_library call that uses DBLAS_OBJS ojbects. 2015-01-30 15:21:21 -06:00
Hank Anderson
3e8ea7a351 Added COMMONOBJS to driver/others CMakeLists.txt. 2015-01-30 14:06:14 -06:00
Hank Anderson
d3dcdddf75 Moved functions into util cmake file. 2015-01-30 13:47:40 -06:00
Hank Anderson
e5e7595bf9 Added paramater to GenerateObjects for defines that affect all sources. 2015-01-30 13:31:13 -06:00
Hank Anderson
7693887d61 Added empty set to the combinations generated by AllCombinations. 2015-01-30 13:01:11 -06:00
Hank Anderson
8d9b196e0d Moved loop over define combos into a function.
This function takes a set of sources and a set of preprocessor
definitions. It will iterate over the sources and build an object
file for each combination of preprocessor definitions for each
source file.
2015-01-30 12:14:44 -06:00
Hank Anderson
a6cf8aafc0 Updated level3/CMakeLists with correct defines using all combos. 2015-01-30 11:21:50 -06:00
Hank Anderson
dbdca7bf0c Added first pass at driver/level3 Makefile conversion.
Added a rather convoluted CMake function to find all combinations
of a given list. This will be useful for the object files that are
compiled multiple times with different combinations of preprocessor
definitions.
2015-01-29 22:53:11 -06:00
Hank Anderson
dabaecb2bc Moved getarch parsing code into a function. 2015-01-29 09:30:47 -06:00
Zhang Xianyi
07ff001981 Merge pull request #495 from jeromerobert/develop
Fix a segfault in gemv when MAX_STACK_ALLOC is set
2015-01-29 18:23:50 +08:00
Jerome Robert
b17ccb4c5c Fix a segfault in gemv when MAX_STACK_ALLOC is set
* stack_alloc_size is needed after the implementation call
but it may be overwritten if it's optimized to a register,
because some gemv implementation (ex: dgemv_n.S) do not
restore all register (ex: r10).
* do the same in ger.c for the same reasons even if the bug
has not been observed.
2015-01-29 09:55:57 +01:00
Hank Anderson
8c23965da3 prebuild.cmake now reads the output from getarch into CMake vars. 2015-01-28 22:57:44 -06:00
Hank Anderson
61f21b5d03 getarch_2nd now appends its output to config.h/config_kernel.h 2015-01-28 22:20:15 -06:00
Hank Anderson
8ede4a8da4 getarch now compiles and sets config.h defines properly.
Still isn't parsed into CMake variables, and getarch_2 needs to
get the same treatment.
2015-01-28 17:18:26 -06:00
Hank Anderson
1c5b6bb4f7 Added CORE define to config.h in prebuild.cmake (temporarily). 2015-01-28 16:33:48 -06:00
Hank Anderson
c5f5c7a076 Updated c_check OS/compiler/bits detection. 2015-01-28 15:47:47 -06:00
Hank Anderson
9a508abdc7 Added first pass at driver/level2 makefile conversion. 2015-01-28 14:52:15 -06:00
Hank Anderson
5eefe18ae4 Added CMakeLists.txt for the first of the BLAS folders.
It only does the double precision compile currently.

I realized I didn't finish converting Makefile.system yet, so I made
a note of that.
2015-01-27 16:17:17 -06:00
Hank Anderson
1e8bb0e0e0 Fixed architecture detection when AMD64 in c_check. 2015-01-27 14:03:46 -06:00
Hank Anderson
864b8b31de Fixed incorrect case in OS_ definition in c_check. 2015-01-27 13:54:29 -06:00
Hank Anderson
d2d15e522f Started converting lib target to CMake.
The main part of this target is looping through the BLAS subfolders
and calling make on them. Need to add CMakeLists.txt for each of these
subfolders.
2015-01-27 12:23:35 -06:00
Hank Anderson
f4d1e7a265 Hardcoded NUM_CORES to get system.cmake working. 2015-01-27 11:37:39 -06:00
Zhang Xianyi
63c6fcfa0a Merge pull request #490 from eschnett/develop
Move #include statements outside extern "C" blocks
2015-01-13 15:43:56 +08:00
Erik Schnetter
29cb47fc06 Move #include statements outside extern "C" blocks 2015-01-12 21:27:52 -05:00
Zhang Xianyi
4e6c4046f7 Fix cortex-a15 detecting bug. 2015-01-12 09:35:16 +00:00
Zhang Xianyi
229ce2ccd1 Add cortex-a9 and cortex-a15 targets. 2015-01-12 08:55:29 +00:00
Zhang Xianyi
ef75be0e51 Merge pull request #487 from kortschak/dromtg-test
Add test for drotmg bug fixed by 692b14c
2015-01-07 14:13:11 +08:00
kortschak
5344f335a8 Add test for drotmg bug fixed by 692b14c
Test requested in issue xianyi/OpenBLAS#484.

Run tests by applying the following change and then make:

	diff --git a/Makefile.rule b/Makefile.rule
	index bea1fe1..9852ff3 100644
	--- a/Makefile.rule
	+++ b/Makefile.rule
	@@ -140,7 +140,7 @@ NO_AFFINITY = 1

	-# UTEST_CHECK = 1
	+UTEST_CHECK = 1
2015-01-07 10:06:55 +10:30
Hank Anderson
0f6bec0a32 cmake.prebuild now compiles getarch.
Doesn't actually run it yet.
2015-01-01 21:03:17 -06:00
Hank Anderson
92cdac5f87 Added MSVC functions to cpuid_x86.c to replace gcc-specific ASM. 2015-01-01 21:02:48 -06:00
Hank Anderson
1a41022e3e Added MSVC defines to cpuid.h and getarch.c. 2015-01-01 21:01:28 -06:00
Zhang Xianyi
5cb5af9333 Add configuration options. 2015-01-02 02:42:32 +08:00
Zhang Xianyi
41aad0407f Merge pull request #482 from jeromerobert/develop
Allow to do gemv and ger buffer allocation on the stack
2015-01-02 02:26:17 +08:00
Hank Anderson
e5c47e44f6 First pass at converting a few makefiles to CMake. 2014-12-30 21:53:00 -06:00
Zhang Xianyi
f8f2e84659 Merge pull request #486 from wernsaar/develop
Optimizations for steamroller
2014-12-31 02:36:23 +08:00
Werner Saar
34633fef01 Merge branch 'develop' of github.com:wernsaar/OpenBLAS into develop 2014-12-30 20:16:53 +08:00
Werner Saar
ddf983d643 added optimizations for steamroller 2014-12-30 20:14:45 +08:00
Zhang Xianyi
17b9db20f1 Merge pull request #483 from wernsaar/develop
added Steamroller as a  cpu target
2014-12-29 12:00:16 +08:00
Werner Saar
0dc559ed30 bugfix in dynamic.c 2014-12-28 17:15:42 +01:00
Werner Saar
9566f5fdb0 added Steamroller as a target processor 2014-12-28 13:45:19 +01:00
Werner Saar
4319769b79 added target processor STEAMROLLER 2014-12-28 20:16:46 +08:00
Jerome Robert
e9d9a8eae3 Allow to do gemv and ger buffer allocation on the stack
ger and gemv call blas_memory_alloc/free which in their turn
call blas_lock. blas_lock create thread contention when matrices
are small and the number of thread is high enough. We avoid
call blas_memory_alloc by replacing it with stack allocation.
This can be enabled with:
make -DMAX_STACK_ALLOC=2048
The given size (in byte) must be high enough to avoid thread contention
and small enough to avoid stack overflow.

Fix #478
2014-12-27 14:33:12 +01:00
Zhang Xianyi
cbb3ab80e7 Merge pull request #481 from eschnett/develop
Correct ilaver C declaration
2014-12-26 10:09:19 +08:00
Erik Schnetter
cd9868b1b4 Correct ilaver C declaration 2014-12-25 17:41:17 -05:00
Zhang Xianyi
eb738148fe Merge pull request #479 from wernsaar/develop
workaround for sandybridge zgemm kernel
2014-12-23 00:59:41 +08:00
Werner Saar
587e16fba3 Ref #458: Backport, sandybrigde uses nehalem zgemm kernel 2014-12-22 17:01:18 +01:00
Werner Saar
4de7b9ae47 increased NMAX to 128 2014-12-22 14:04:27 +01:00
Werner Saar
887aed634d modified sources for OS Darwin 2014-12-19 12:40:46 +01:00
Werner Saar
6261342de3 small optimization on dgemm_kernel for N=1 2014-12-18 20:35:51 +01:00
Werner Saar
1e566223ed added code for the size of n 2014-12-17 15:02:11 +01:00
Werner Saar
113b48ca22 modified makefile for acml6.1 2014-12-17 14:12:21 +01:00
Zhang Xianyi
3e81c99b6b Fixed installation bug on Mac OSX. 2014-12-13 13:05:06 +08:00
Werner Saar
ec85c4a51d Increased the Threshold value in sep.in 2014-12-11 14:57:41 +01:00
Werner Saar
97de657d38 added tests to sep.as as workaround for gfortran-4.8.x 2014-12-11 13:53:59 +01:00
Zhang Xianyi
71966eba6c Merge pull request #475 from xantares/patch-2
add OpenBLAS_VERSION to cmake config file
2014-12-09 17:57:43 +08:00
Zhang Xianyi
a359979e17 Merge pull request #474 from xantares/patch-1
set OPENBLAS_CMAKE_DIR to <prefix>/lib/cmake/<package_name>
2014-12-09 17:57:16 +08:00
xantares
7a6a141bc4 add OpenBLAS_VERSION to cmake config file 2014-12-09 10:34:41 +01:00
xantares
b8ff6892f6 set OPENBLAS_CMAKE_DIR to <prefix>/lib/cmake/<package_name>
usually these files are more often located in this subdir
2014-12-09 10:18:18 +01:00
Zhang Xianyi
8fe7a9ce6f Merge pull request #473 from wernsaar/develop
changed inline assembler labels to short form
2014-12-08 13:22:18 +08:00
Werner Saar
bc5fff7085 changed inline assembler labels to short form 2014-12-07 12:38:54 +01:00
Zhang Xianyi
51ce5ef447 Merge branch 'develop' 2014-12-03 23:14:21 +08:00
Zhang Xianyi
1943ea91a8 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2014-12-03 23:03:48 +08:00
Zhang Xianyi
37aee1f9b1 Merge branch 'develop' 2014-12-03 23:01:33 +08:00
Zhang Xianyi
f5424fc9de Update the doc for 0.2.13 version. 2014-12-03 23:00:29 +08:00
Zhang Xianyi
0cf29ba6d2 Fixed a bug of sgemm sandy bridge kernel.
Reported by Julia project. JuliaLang/julia#9084
2014-12-03 17:38:41 +08:00
Zhang Xianyi
50e18033e6 Merge pull request #471 from nolta/patch-4
c_check: set $hostarch to x86_64 instead of amd64
2014-12-03 12:53:20 +08:00
Zhang Xianyi
551b55d1c7 Merge pull request #470 from nolta/patch-3
fix fortran compiler detection on FreeBSD
2014-12-03 12:50:46 +08:00
Mike Nolta
271ceb8bae c_check: set $hostarch to x86_64 instead of amd64
`uname -m` returns "amd64" on some systems.
2014-12-02 21:23:23 -05:00
Mike Nolta
5f846be2e4 fix fortran compiler detection on FreeBSD
On FreeBSD, passing extra options to `which` causes it to report a non-zero status:

```
$ which gfortran48 -m64
/usr/local/bin/gfortran48
$ echo $?
1
```

```
$ which gfortran48
/usr/local/bin/gfortran48
$ echo $?
0
```
2014-12-02 20:47:40 -05:00
Zhang Xianyi
fe7dcf98f3 Refs #461. Provide OpenBLASConfig.cmake to support CMake.
If you "make PREFIX=/path/to/OpenBLAS install" ,
The config file will be located in /path/to/OpenBLAS/cmake

Then, you can use "find_package(OpenBLAS)" at CMake.
cmake -DOpenBLAS_DIR=/path/to/OpenBLAS/cmake ..
2014-11-29 02:16:40 +08:00
Zhang Xianyi
2fb02626da Update organization info. 2014-11-25 15:28:58 +08:00
Zhang Xianyi
a85c2785ae Refs #467. Added generic kernel file for x86_64. 2014-11-24 15:34:48 +08:00
Zhang Xianyi
4806715c97 Fixed #456. Merged the optimizations for APM's
xgene-1 (aarch64).
Merge branch 'benedikt-huber-dave-patch' into develop
2014-11-11 22:21:04 +08:00
Benedikt Huber
58c90d5937 # The first commit's message is:
Optimizations for APM's xgene-1 (aarch64).

1) general system updates to support armv8 better.  Make all did not work, one needed to supply TARGET=ARMV8.
2) sgem 4x4 kernel in assembler using SIMD, and configuration changes to use it.
3) strmm 4x4 kernel in C.  Since the sgem kernel does 4x4, the trmm kernel must also do 4xN.

Added Dave Nuechterlein to the contributors list.
2014-11-11 22:19:23 +08:00
Zhang Xianyi
2987bc7b40 refs #464. Fixed the bug of detecting L2 associative on x86. 2014-11-10 17:15:34 +08:00
Zhang Xianyi
695e0fa649 #463 fixed a compiling bug on AIX. 2014-11-10 14:39:56 +08:00
Zhang Xianyi
cbb23c46c2 Merge pull request #459 from tkelman/symbol-rename
add SYMBOLPREFIX and SYMBOLSUFFIX makefile options
2014-10-25 19:49:03 +08:00
Tony Kelman
0b4602b753 add SYMBOLPREFIX and SYMBOLSUFFIX makefile options
for adding a prefix or suffix to all exported symbol names in the shared library
Useful to avoid conflicts with other BLAS libraries, especially when using
64 bit integer interfaces in OpenBLAS

Note that since OSX does not have the objcopy utility, setting these options
to non-empty values on Mac requires the objconv tool, available (GPL license)
from http://www.agner.org/optimize/#objconv
2014-10-24 22:27:09 -07:00
Zhang Xianyi
7e4e195e82 Merge branch 'develop' 2014-10-13 17:10:41 +08:00
Zhang Xianyi
ac5a7e1c1b Update dot to 0.2.12 version. 2014-10-13 17:10:12 +08:00
wernsaar
f1b9a4a1ca Ref #454: fixed bug in common_param.h 2014-09-23 11:34:29 +02:00
Zhang Xianyi
ae6b7caf32 Merge pull request #453 from wernsaar/develop
Enabled GEMM3M functions
2014-09-22 16:47:54 +08:00
wernsaar
f446d2368a updated cblas.h and cblas_noconst.h 2014-09-21 13:39:15 +02:00
wernsaar
dab4edd069 added benchmark for gemm3m functions 2014-09-21 12:00:41 +02:00
wernsaar
9d7057366d bugfix for GEMM3M functions 2014-09-21 11:41:43 +02:00
wernsaar
7f234f8ed1 added GEMM3M tests 2014-09-21 10:55:08 +02:00
wernsaar
9e829ce98f enabled cblas gemm3m functions 2014-09-20 17:20:02 +02:00
wernsaar
d49fd33885 disabled SYMM3M and HEMM3M functions because segment violations 2014-09-20 15:27:40 +02:00
wernsaar
f0f9b25bb6 added test for CGEMM3M function 2014-09-20 14:53:30 +02:00
wernsaar
7aae4a62e7 enabled use of GEMM3M functions 2014-09-20 14:27:10 +02:00
wernsaar
7a911569b8 added test for GEMM3M functions 2014-09-20 14:21:42 +02:00
wernsaar
466bfb8b86 updated README.md 2014-09-17 16:01:07 +02:00
Zhang Xianyi
70d1ba09b2 Update the doc for target list. 2014-09-17 14:29:21 +08:00
Zhang Xianyi
d293b78b64 Merge pull request #451 from eshelman/patch-1
Add HASWELL to TargetList.txt
2014-09-17 14:20:06 +08:00
Eliot Eshelman
9912dbbcf9 Add HASWELL to TargetList.txt
The Intel "Haswell" architecture is missing from the list of build targets.
2014-09-16 18:26:45 -04:00
Zhang Xianyi
01bc462e8e Merge pull request #449 from wernsaar/develop
optimized multithreading lower limits
2014-09-16 14:33:48 +08:00
wernsaar
3300f5ebff optimized multithreading lower limits 2014-09-15 11:38:25 +02:00
Zhang Xianyi
59e2c20557 Merge pull request #448 from wernsaar/develop
Optimized cgemv and zgemv kernels
2014-09-15 13:12:14 +08:00
wernsaar
b7c9566eea removed obsolete gemv kernel files 2014-09-14 11:00:53 +02:00
wernsaar
6df1b0be81 optimized zgemv_n_microk_sandy-4.c 2014-09-14 10:21:22 +02:00
wernsaar
2ac1e076c1 added optimized zgemv_n kernel for sandybridge 2014-09-14 09:02:05 +02:00
wernsaar
9908b6031c bugfix in KERNEL.PILEDRIVER 2014-09-13 16:26:53 +02:00
wernsaar
8f100a14f2 optimized cgemv_t kernel for haswell 2014-09-13 16:13:27 +02:00
wernsaar
53b5726b04 added optimized cgemv_t kernel for haswell 2014-09-13 15:14:12 +02:00
wernsaar
1a352b24e6 updated KERNEL.HASWELL 2014-09-13 12:23:27 +02:00
wernsaar
5194818d4b updated zgemv_t_4.c 2014-09-13 09:48:34 +02:00
wernsaar
8a39cdb1c1 added optimized zgemv_t kernel for haswell 2014-09-13 09:47:07 +02:00
wernsaar
fd2478c9e2 optimized interface/zgemv.c for multithreading 2014-09-12 19:18:23 +02:00
wernsaar
0a1390f2d8 enabled optimized zgemv_t kernel for bulldozer 2014-09-12 17:43:47 +02:00
wernsaar
a8b0812feb optimized zgemv_t for bulldozer 2014-09-12 17:42:25 +02:00
wernsaar
a0fb68ab42 added optimized zgemv_t kernel for bulldozer 2014-09-12 17:04:22 +02:00
wernsaar
44c11165d5 bugfix in cgemv_t_4.c 2014-09-12 14:12:24 +02:00
wernsaar
564be4eb72 added optimized cgemv_t kernel 2014-09-12 13:38:01 +02:00
wernsaar
107c3ea7d5 added optimized zgemv_t routine 2014-09-12 12:35:20 +02:00
wernsaar
bb8d698335 optimized zgemv_n_microk_haswell-4.c for small size 2014-09-11 13:44:55 +02:00
wernsaar
e0192a6914 bugfix in zgemv_n_4.c 2014-09-11 13:18:00 +02:00
wernsaar
bced4594bb added optimized zgemv_n kernel 2014-09-11 12:34:57 +02:00
wernsaar
cafba99b6b bufix in cgemv_n_microk_haswell-4.c 2014-09-11 11:12:44 +02:00
wernsaar
ac8f232b2a more optimizations 2014-09-11 10:25:48 +02:00
wernsaar
f98e1244c4 optimized cgemv_n_4.c 2014-09-10 19:26:14 +02:00
wernsaar
be95700b30 added optimized cgemv_kernel for haswell 2014-09-10 14:11:24 +02:00
wernsaar
4aa534ae93 added cgemv_n kernel, optimized for small sizes 2014-09-10 13:45:13 +02:00
Zhang Xianyi
1cba8e7b11 Merge pull request #446 from grisuthedragon/cblas_matcopy
Add a CBLAS interface for the BLAS extension s/d/c/z*matcopy routines.
2014-09-10 16:31:31 +08:00
Zhang Xianyi
d13e92f07e Merge pull request #445 from wernsaar/develop
A lot of optimizations for gemv kernels
2014-09-10 16:28:14 +08:00
wernsaar
baa46e4fba added and tested optimized dgemv_n kernel for haswell 2014-09-09 16:17:45 +02:00
wernsaar
faab7a181d added optimized dgemv_n kernel for haswell 2014-09-09 15:32:32 +02:00
wernsaar
8109d8232c optimized dgemv_t kernel for haswell 2014-09-09 14:38:08 +02:00
wernsaar
debc6d1a05 bugfix in KERNEL.HASWELL 2014-09-09 14:04:44 +02:00
wernsaar
e73a0113ec added optimized gemv kernels 2014-09-09 13:54:55 +02:00
wernsaar
44f2bf9bae added optimized dgemv_t kernel for haswell 2014-09-09 13:34:22 +02:00
Martin Koehler
a057e5434d add CBLAS interface for s/d/c/zimatcopy 2014-09-09 09:52:13 +02:00
wernsaar
cd34e9701b removed obsolete files 2014-09-08 19:15:31 +02:00
Martin Köhler
7794766d3c Add cblas_(s/d/c/z)omatcopy in order to have cblas interface for them. 2014-09-08 17:57:44 +02:00
wernsaar
658939faaa optimized dgemv_n kernel for small sizes 2014-09-08 15:22:35 +02:00
wernsaar
f511807fc0 modified multithreading threshold 2014-09-08 12:27:32 +02:00
wernsaar
c4d9d4e5f8 added haswell optimized kernel 2014-09-08 12:25:16 +02:00
wernsaar
7c0a94ff47 bugfix in sgemv_n_microk_haswell-4.c 2014-09-08 10:54:33 +02:00
wernsaar
cbbc80aad3 added optimized sgemv_t kernel for haswell 2014-09-08 10:13:39 +02:00
wernsaar
2be5c7a640 bugfix for windows 2014-09-07 21:48:42 +02:00
wernsaar
80f7786875 enabled optimized sgemv kernels for piledriver 2014-09-07 21:13:57 +02:00
wernsaar
553e275407 optimized sgemv_n kernel for sandybridge 2014-09-07 20:53:30 +02:00
wernsaar
7b3932b3f3 optimized sgemv_n kernel for nehalem 2014-09-07 19:20:08 +02:00
wernsaar
75207b1148 optimized sgemv_n for very small size of m 2014-09-07 18:23:48 +02:00
wernsaar
274828fa50 optimizations for very small sizes 2014-09-07 13:45:03 +02:00
wernsaar
5ae1731fe6 better optimzations for sgemv_t kernel 2014-09-06 21:28:57 +02:00
wernsaar
c8eaf3ae2d optimized sgemv_t_4 kernel for very small sizes 2014-09-06 19:41:57 +02:00
wernsaar
3a7ab47ee9 optimized sgemv_t 2014-09-06 18:34:25 +02:00
wernsaar
cf5544b417 optimization for small size 2014-09-06 13:17:56 +02:00
wernsaar
d143f84dd2 added optimized sgemv_n kernel for haswell 2014-09-06 12:08:48 +02:00
wernsaar
7794237475 undef WHEREAMI 2014-09-06 11:01:42 +02:00
wernsaar
a64fe9bcc9 added optimized sgemv_n kernel for sandybridge 2014-09-06 08:41:53 +02:00
wernsaar
2021d0f9d6 experimentally removed expensive function calls 2014-09-05 15:05:53 +02:00
wernsaar
6df7a88930 optimized sgemv_t for sandybridge 2014-09-05 10:22:50 +02:00
wernsaar
53de943690 bugfix for sgemv_n_4.c 2014-09-04 18:55:52 +02:00
wernsaar
7f910010a0 optimized sgemv_n kernel for small sizes 2014-09-04 13:09:27 +02:00
wernsaar
3a5d8dbff9 optimized sgemv_n_4.c 2014-09-03 15:34:30 +02:00
wernsaar
2a60c6d4b0 optimized sgemv_n for small sizes 2014-09-03 14:48:45 +02:00
wernsaar
0fc560ba23 bugfix for buffer overflow 2014-09-03 10:13:47 +02:00
wernsaar
d1800397f5 optimized interface/gemv.c for multithreading 2014-09-02 17:36:07 +02:00
wernsaar
f4ff889491 updated interface/gemv.c for multithreading 2014-09-02 16:30:04 +02:00
wernsaar
210bec9111 added plot-header to compare multithreading 2014-09-02 14:11:42 +02:00
wernsaar
f3b50dcf5b removed obsolete instructions from sgemv_t_4.c 2014-09-02 13:35:41 +02:00
wernsaar
93eaba959d optimized sgemv_t for bulldozer 2014-09-02 12:42:36 +02:00
wernsaar
9570e56965 optimized sgemv_t_4.c for small sizes 2014-09-01 15:11:37 +02:00
wernsaar
d7f91f8b4f extended gemv.c benchmark 2014-09-01 15:07:36 +02:00
wernsaar
53f1277b6b modified benchmark/gemv.c 2014-08-31 15:38:18 +02:00
wernsaar
bc99faef1b optimized sgemv_t_4.c for uneven sizes 2014-08-31 14:33:15 +02:00
wernsaar
848c0f16f7 optimized sgemv_t_4.c for small size 2014-08-31 13:23:44 +02:00
wernsaar
e2fc8c8c2c changed 1 test value (bug in lapack-testing?) 2014-08-30 13:58:02 +02:00
wernsaar
53e6dbf6ca optimized sgemv_t kernel for small sizes 2014-08-30 13:36:27 +02:00
Zhang Xianyi
868f8a8756 Merge pull request #443 from idunham/fix
Workaround PIC limitations in cpuid.
2014-08-29 13:31:06 +08:00
Isaac Dunham
db7e6366cd Workaround PIC limitations in cpuid.
cpuid uses register ebx, but ebx is reserved in PIC.
So save ebx, swap ebx & edi, and return edi.

Copied from Igor Pavlov's equivalent fix for 7zip (in CpuArch.c),
which is public domain and thus OK license-wise.
2014-08-28 13:05:07 -07:00
Zhang Xianyi
2702323f7d Merge pull request #440 from wernsaar/develop
optimizations for leve1 and level2 blas functions
2014-08-28 12:43:54 +08:00
wernsaar
20cd850125 modification for clang compiler 2014-08-27 09:00:20 +02:00
wernsaar
5fa6158731 renoved flag no-integrated-as, because not working on macosx 2014-08-26 18:29:40 +02:00
wernsaar
84badf8086 EXPERIMENTAL: added the flag -no-integrated-as for clang compiler in Makefile.system 2014-08-26 17:36:32 +02:00
Zhang Xianyi
c8cc4a0d22 Fixed the typo in Changelog.txt 2014-08-26 16:14:34 +08:00
wernsaar
3885eebdb8 added optimized zaxpy bulldozer kernel 2014-08-25 15:52:35 +02:00
wernsaar
ee74445155 added optimized caxpy kernel for bulldozer 2014-08-25 14:53:28 +02:00
wernsaar
9d2ace8bac added optimized daxpy kernel for bulldozer 2014-08-24 10:57:12 +02:00
wernsaar
b55f997302 added optimized daxpy kernel for nehalem 2014-08-23 17:53:07 +02:00
wernsaar
29125864b3 updated gemm.c 2014-08-23 17:28:01 +02:00
wernsaar
e45c960c2c added optimized saxpy kernel for nehalem 2014-08-23 17:15:21 +02:00
wernsaar
55e81da379 added axpy benchmark-test 2014-08-23 13:12:44 +02:00
wernsaar
ac76b6267f added optimized dgemv_n kernel for nehalem 2014-08-23 10:40:57 +02:00
wernsaar
f1b96c4846 added optimized ddot kernel for bulldozer 2014-08-22 21:19:29 +02:00
wernsaar
16d6be852d added optimized ddot kernel for nehalem 2014-08-22 20:34:41 +02:00
wernsaar
53ec5789e2 bugfix for Makefile 2014-08-22 17:02:55 +02:00
wernsaar
95a707ced3 update of KERNEL.BULLDOZER 2014-08-22 17:01:27 +02:00
wernsaar
5d97b0754c added optimized sdot kernel for nehalem 2014-08-22 17:00:26 +02:00
wernsaar
8a9e868919 added optimized sdot for bulldozer 2014-08-22 14:29:17 +02:00
wernsaar
7e404de3de bugfix in Makefile 2014-08-22 11:51:30 +02:00
wernsaar
e4472ad850 added sdot and ddot benchmarks 2014-08-22 11:42:07 +02:00
wernsaar
fb0b4552a5 added hemv benchmark 2014-08-22 10:00:09 +02:00
wernsaar
6f73ffc114 added benchmarks for csymv and zsymv 2014-08-21 19:33:57 +02:00
wernsaar
c8b0645266 added optimized symv_L kernels for nehalem 2014-08-21 14:27:00 +02:00
wernsaar
ec05ff3f64 added optimized ssymv_L kernel for bulldozer 2014-08-21 13:32:06 +02:00
wernsaar
f6f9122660 added optimized dsymv_L kernel for bulldozer 2014-08-21 13:02:53 +02:00
wernsaar
8247f38dc1 added optimized dsymv_U kernel for nehalem 2014-08-20 09:58:04 +02:00
wernsaar
ef6374196d updated optimized dsymv_U kernel for bulldozer 2014-08-20 09:00:56 +02:00
wernsaar
f824c2b751 updated optimized ssymv_U for bulldozer 2014-08-19 19:25:03 +02:00
wernsaar
4ba4ab623f added optimized ssymv_U kernel for nehalem 2014-08-19 17:09:45 +02:00
wernsaar
4f39447c05 added optimized ssymv_U kernel for bulldozer 2014-08-18 13:52:24 +02:00
wernsaar
74c9465672 added optimized dsymv_U kernel for bulldozer 2014-08-18 12:18:10 +02:00
Zhang Xianyi
a7126c2ce4 Merge branch 'develop' 2014-08-18 11:16:14 +08:00
Zhang Xianyi
a69dd3fbc5 OpenBLAS 0.2.11 version. 2014-08-18 11:15:42 +08:00
wernsaar
101dd08173 add reference in C for symv_U 2014-08-16 13:52:50 +02:00
wernsaar
493d4fe7e5 added reference in C for symv_L 2014-08-16 11:36:48 +02:00
wernsaar
0a22816e70 Ref #433: removed obsolete lapack entries from common_interface.h 2014-08-15 12:40:10 +02:00
Zhang Xianyi
c3cd6e7e32 Merge pull request #434 from wernsaar/develop
A lot of performance enhancements
2014-08-15 08:07:27 +08:00
wernsaar
11eab4c019 added optimized cgemv_n for haswell 2014-08-14 19:00:30 +02:00
wernsaar
4568d32b6b added optimized cgemv_t kernel for haswell 2014-08-14 14:10:29 +02:00
wernsaar
c1a6374c6f optimized zgemv_n kernel for sandybridge 2014-08-13 16:10:03 +02:00
wernsaar
dc05937313 added additional test values 2014-08-13 14:54:50 +02:00
wernsaar
2470129132 added fast return, if m or n < 1 2014-08-13 13:54:19 +02:00
wernsaar
8c582d362d optimized zgemv_t_microk_haswell-2.c 2014-08-13 13:42:22 +02:00
wernsaar
11e34ddd1b bugfix for zgemv_n_microk_haswell-2.c 2014-08-13 12:54:18 +02:00
wernsaar
9528f0d9ee bugfix in zgemv_n_microk_sandy-2.c 2014-08-13 12:18:03 +02:00
wernsaar
b06550519e added optimized cgemv_t c-kernel 2014-08-12 12:15:41 +02:00
wernsaar
6093ee5363 bugfix in zgemv_n_microk_haswell-2.c 2014-08-12 10:02:25 +02:00
wernsaar
07c66b1960 modified algorithm for better numerical stability 2014-08-12 08:35:42 +02:00
wernsaar
58b075daef added optimized zgemv_t kernel for haswell 2014-08-11 16:57:52 +02:00
wernsaar
09fcd3a341 add optimized zgemv_t kernel for bulldozer 2014-08-11 14:19:25 +02:00
wernsaar
726ad085cb added optimized zgemv_t for haswell 2014-08-11 13:10:12 +02:00
wernsaar
6fe416976d added optimimized zgemv_t c-kernel 2014-08-11 09:13:18 +02:00
wernsaar
dbc2eff029 disabled optimized haswell zgemv_n kernel for windows ( bad rounding ) 2014-08-10 11:57:24 +02:00
wernsaar
462b4885ff added optimized zgemv_n kernel for haswell 2014-08-10 08:39:17 +02:00
wernsaar
aa54fe064c added zgemv_n c-function 2014-08-07 22:30:20 +02:00
wernsaar
006ef3ea01 added optimized dgemv_t kernel for haswell 2014-08-07 10:08:54 +02:00
wernsaar
60f17628cc added optimized dgemv_n kernel for haswell 2014-08-07 09:18:02 +02:00
wernsaar
c9bad1403a added optimized sgemv_t kernel for sandybridge 2014-08-07 07:49:33 +02:00
wernsaar
2f8927376f enabled optimized nehalem sgemv_t kernel for windows 2014-08-06 16:58:21 +02:00
wernsaar
d945a2b06d added optimized sgemv_t kernel for nehalem 2014-08-06 16:21:48 +02:00
wernsaar
ca6c8d06ce enabled optimized sgemv kernels for windows 2014-08-06 14:24:36 +02:00
wernsaar
7aa43c8928 enabled optimized sgemv kernels for windows 2014-08-06 14:06:30 +02:00
wernsaar
891b960854 added optimized sgemv_t kernel for haswell 2014-08-06 13:42:41 +02:00
wernsaar
95a8caa2f3 added optimized sgemv_t kernel 2014-08-06 12:12:17 +02:00
Zhang Xianyi
5c0d0ecbde Merge pull request #430 from wernsaar/develop
added a better optimized sgemv_n kernel
2014-08-06 02:52:30 +08:00
wernsaar
8c05b8105b bugfix in sgemv_n.c 2014-08-05 20:14:29 +02:00
wernsaar
c80084a98f changed default x86_64 sgemv_n kernel to sgemv_n.c 2014-08-05 19:42:56 +02:00
wernsaar
2bab92961f enabled optimized sgemv_n kernels for windows 2014-08-05 14:52:54 +02:00
wernsaar
9175b8bd5f changed long to blaslong for windows compatibility 2014-08-05 13:28:39 +02:00
wernsaar
793f2d43b0 added optimized sgemv_n kernel for nehalem 2014-08-05 10:50:08 +02:00
wernsaar
a4dde45f87 optimized sgemv_n kernel for sandybridge 2014-08-05 08:53:09 +02:00
wernsaar
7fa7ea3e1e updated haswell optimized sgmv_n kernel 2014-08-05 08:04:47 +02:00
wernsaar
3fbc13eb65 modified sgemv_n for haswell 2014-08-04 16:22:11 +02:00
wernsaar
db6917303f added a better optimized sgemv_n kernel for bulldozer and piledriver 2014-08-04 14:29:01 +02:00
Zhang Xianyi
c2fdeb6c22 Merge pull request #429 from idunham/numprocs
Fix link error on Linux/musl.
2014-08-04 08:12:23 +08:00
Isaac Dunham
f7eb81a846 Fix link error on Linux/musl.
get_nprocs() is a GNU convenience function equivalent to POSIX2008
sysconf(_SC_NPROCESSORS_ONLN); the latter should be available in unistd.h
on any current *nix. (OS X supports this call since 10.5, and FreeBSD
currently supports it. But this commit does not change FreeBSD or OS X
versions.)
2014-08-03 15:06:30 -07:00
Zhang Xianyi
edc329883c Merge pull request #427 from wernsaar/develop
added experimental support for big numa machines
2014-08-03 00:57:44 +08:00
wernsaar
793175be3a added experimental support for big numa machines 2014-08-02 13:40:16 +02:00
Zhang Xianyi
83c4ba8d32 Merge pull request #426 from wernsaar/develop
added benchmark program for lapack ?getri functions
2014-08-02 15:34:41 +08:00
wernsaar
271af406f3 bugfix for linux affinity code 2014-08-01 23:10:08 +02:00
wernsaar
f5f50b3563 added benchmarks for lapack potrf, potrs and potri functions 2014-08-01 21:08:37 +02:00
wernsaar
651dd22d7d added benchmark program for lapack ?getri functions 2014-08-01 08:55:20 +02:00
Zhang Xianyi
f329f77bd0 Merge pull request #425 from wernsaar/develop
added benchmark for lapack ?geev routines
2014-08-01 08:04:16 +08:00
wernsaar
7c611a2f95 bugfix for zgeev 2014-07-31 12:35:38 +02:00
wernsaar
296564e369 added lapack geev benchmark 2014-07-31 10:35:25 +02:00
Zhang Xianyi
27af6e35d3 Merge pull request #424 from ihnorton/fix_arm_cpuid
cpuid_arm: fix detection when cpuinfo uses "Processor"
2014-07-31 13:54:07 +08:00
Isaiah Norton
a183ad1df4 cpuid_arm: fix detection when cpuinfo uses "Processor"
instead of "model name"
2014-07-31 05:13:31 +00:00
wernsaar
799a0eabbd bugfix in cholesky.c 2014-07-30 14:00:19 +02:00
wernsaar
ca63503e61 extented plot-filter.sh for linpack and cholesky benchmarks 2014-07-30 13:03:42 +02:00
Zhang Xianyi
4f83217df6 Merge pull request #422 from wernsaar/develop
optimization of sandybridge cgemm-kernel
2014-07-30 17:09:58 +08:00
wernsaar
5087096711 optimization of sandybridge cgemm-kernel 2014-07-29 19:07:21 +02:00
Zhang Xianyi
21f7768b26 Merge pull request #421 from wernsaar/develop
optimized sgemm- and cgemm-kernel for haswell
2014-07-29 15:50:00 +08:00
wernsaar
46bc4fd50c optimized cgemm kernel for haswell 2014-07-29 08:53:09 +02:00
wernsaar
1cc02b4337 optimized sgemm kernel for haswell 2014-07-28 11:50:01 +02:00
Zhang Xianyi
6e223db7fc Merge pull request #420 from wernsaar/develop
Optimizations for HASWELL
2014-07-27 23:30:14 +08:00
wernsaar
1d33547222 optimized zgemm kernel for haswell 2014-07-27 11:51:42 +02:00
wernsaar
3ea4dadd30 optimizations for trsm 2014-07-25 11:59:17 +02:00
wernsaar
1b10ff129a optimizations for trmm 2014-07-25 10:00:23 +02:00
wernsaar
125610d23b allow to set custom value for ?GEMM_DEFAULT_UNROLL_MN, optimizations for syrk 2014-07-24 18:43:31 +02:00
wernsaar
e213a42cde added a sample plot-filter scripts and a header file for gnuplot 2014-07-21 14:50:24 +02:00
wernsaar
e4663be46a added symv benchmark 2014-07-21 07:50:54 +02:00
wernsaar
11637b6926 add benchmark for ger 2014-07-21 06:25:42 +02:00
Zhang Xianyi
80bf3e6a35 Merge pull request #419 from wernsaar/develop
added optimized sgemv kernels for Sandy Bridge, Haswell, Bullldozer, and Piledriver.
2014-07-20 23:35:17 +08:00
wernsaar
6acbafe45b added sgemv_n microkernel for haswell 2014-07-20 14:52:25 +02:00
wernsaar
5392d11b04 optimized sgemv_n_microk_sandy.c 2014-07-20 14:08:04 +02:00
wernsaar
c0fe95fb72 added sgemv_n microkernel for sandybridge 2014-07-20 13:17:47 +02:00
wernsaar
d9d4077c93 added sgemv_t microkernel for haswell 2014-07-20 11:30:32 +02:00
wernsaar
02eb72ac42 bugfix in sgemv_t_microk_sandy.c 2014-07-20 10:48:41 +02:00
wernsaar
c06f9986d4 added sgemv_t microkernel for sandybridge 2014-07-20 10:21:08 +02:00
wernsaar
2cce125c79 added optimized sgemv_t for bulldozer and piledriver 2014-07-19 15:48:07 +02:00
wernsaar
b3938fe371 don't use this sgemv_n on Windows 2014-07-19 07:15:34 +02:00
Zhang Xianyi
e6668dd83b Merge pull request #414 from staticfloat/sf/symlinkfix
Don't create an absolute symlink when installing on Darwin
2014-07-18 23:13:18 +08:00
wernsaar
c8a4a56177 performance optimizations for sgemv_n 2014-07-18 11:25:21 +02:00
wernsaar
3c5732615d added blocked sgemv_n and microkernel for bulldozer and piledriver 2014-07-17 23:15:07 +02:00
Zhang Xianyi
f20c0f9819 Merge branch 'develop' 2014-07-17 15:15:57 +08:00
Zhang Xianyi
134fa320e6 Refs #415. Fixed the x86/i386 compiling bug with DYNAMIC_ARCH=1. 2014-07-17 15:02:01 +08:00
Elliot Saba
a79df1ff49 Don't create an absolute symlink when installing on Darwin 2014-07-16 15:31:27 -04:00
wernsaar
7ceb25d7b3 changed string GFORTRAN to lowercase 2014-07-16 17:08:43 +02:00
Zhang Xianyi
21b5347fbe Merge branch 'develop' 2014-07-16 18:04:30 +08:00
Zhang Xianyi
f2eb480738 OpenBLAS 0.2.10 version. 2014-07-16 18:04:18 +08:00
Zhang Xianyi
c94762bb56 Refs #401. Added NO_AVX2 flag for old binutils (e.g. RHEL6) 2014-07-16 08:38:25 +08:00
wernsaar
51413925bd adjust number of threads for small size in cgemv and zgemv 2014-07-15 16:27:02 +02:00
wernsaar
b985cea65d adjust number of threads for sgemv and dgemv 2014-07-15 16:04:46 +02:00
wernsaar
d286daa2ba adjusted number of threads for small size 2014-07-15 14:41:35 +02:00
wernsaar
bcb115b55b added benchmark for gemv 2014-07-15 13:35:36 +02:00
Zhang Xianyi
3dd094f17a Merge pull request #413 from wernsaar/develop
additional benchmarks
2014-07-14 22:39:22 +08:00
wernsaar
339ab34c4c added additional test value to dstest.in 2014-07-13 18:29:19 +02:00
wernsaar
7424e2b609 added additional test value 2014-07-13 18:26:38 +02:00
wernsaar
73594cff73 segment violation in x86_64 sgemv kernels 2014-07-13 10:49:43 +02:00
wernsaar
880597b301 segment violation in sgemv kernels 2014-07-13 10:46:14 +02:00
wernsaar
9c835431d0 modified pathes to atlas, mkl and acml 2014-07-12 16:20:29 +02:00
wernsaar
1d4ffddf69 added conf option for number of loops 2014-07-12 11:54:39 +02:00
wernsaar
b0e7810a6b added her2k benchmark 2014-07-11 16:31:05 +02:00
wernsaar
2b92a8c499 added herk benchmark 2014-07-11 16:16:48 +02:00
wernsaar
274b8dc91a add hemm benchmark 2014-07-11 15:26:34 +02:00
wernsaar
74b237ca22 added syr2k benchmark 2014-07-11 14:48:25 +02:00
wernsaar
c353abd38c added syrk benchmark 2014-07-11 14:21:25 +02:00
wernsaar
0acce17979 added trsm benchmark 2014-07-11 13:51:08 +02:00
wernsaar
2016a685e6 added trmm benchmark 2014-07-11 13:20:42 +02:00
wernsaar
1b9a6aac30 added benchmark for symm 2014-07-11 12:47:48 +02:00
wernsaar
e27433ab6a added gemm benchmark and modified Makefile for benchmark 2014-07-11 11:09:47 +02:00
Zhang Xianyi
7961404a40 Merge pull request #411 from wernsaar/develop
Lapack-test on x86 32bit now runs without errors.
2014-07-10 22:38:15 +08:00
wernsaar
cedc1f4b14 Ref #410: disabled optimized potri functions ( single threading bug) 2014-07-10 13:42:32 +02:00
wernsaar
0884b73c69 Lapack-test Windows 32bit now error free 2014-07-10 11:01:47 +02:00
wernsaar
9bd9472ae9 Lapack-test: cleanup of x86 32bit KERNEL file 2014-07-09 16:08:19 +02:00
Zhang Xianyi
2e2473f390 Merge pull request #409 from wernsaar/develop
some fixes for Lapack and ARM platform
2014-07-09 21:11:00 +08:00
wernsaar
c4a423a642 bugfixes for lapack on ARM Platform 2014-07-09 12:21:39 +02:00
Zhang Xianyi
f9991fd5f6 Merge branch 'develop' 2014-07-09 08:48:00 +08:00
Zhang Xianyi
47688e24e9 OpenBLAS 0.2.10 rc2 version. 2014-07-09 08:47:36 +08:00
wernsaar
61ef0c3419 added cross compiler examples for 32bit and 64bit ARM 2014-07-08 12:55:18 +02:00
Zhang Xianyi
698e77dba4 Refs #406. Fixed utest building bug. 2014-07-08 17:26:49 +08:00
wernsaar
2081f6e8ff Lapack bug114: replaced cgesvd.f and zgesvd.f 2014-07-08 10:21:10 +02:00
wernsaar
dc6b809f15 Lapack bug117: replaced zstemr.f 2014-07-08 10:08:34 +02:00
wernsaar
0f08684649 Lapack bug118: replaced clanhf.f and zlanhf.f 2014-07-08 09:57:40 +02:00
Zhang Xianyi
552119c484 Fixed #407. Support outputing the CPU corename on runtime.
The user can use char * openblas_get_config() or char * openblas_get_corename().
2014-07-08 12:48:08 +08:00
Zhang Xianyi
94d3cfaa10 Merge pull request #404 from wernsaar/develop
A lot of fixes for v0.2.10-rc2
2014-07-07 00:39:33 +08:00
wernsaar
13348b2137 removed reference to daxpy_bulldozer kernel (Windows bug in lapack-test) 2014-07-06 16:39:32 +02:00
wernsaar
783a7d2202 bugfix for fortran compiler 2014-07-06 13:33:42 +02:00
wernsaar
50e99a52ea added definitions for PILEDRIVER and HASWELL 2014-07-06 12:08:27 +02:00
wernsaar
9964ed2f79 bugfix for CORE2 2014-07-06 11:47:28 +02:00
wernsaar
d5b976f92d fallback to zgemm_kernel_4x2_sse.S 2014-07-06 11:05:28 +02:00
wernsaar
f7267d9b0e added missing definition for DUNNINGTON 2014-07-06 10:17:07 +02:00
wernsaar
e0c080a28c removed reference to zgemm_kernel_4x2_sse3.S (bug in lapack-test) 2014-07-05 16:13:17 +02:00
wernsaar
e80b144932 enabled compiling of *3M functions 2014-07-02 14:11:53 +02:00
wernsaar
02a504c0b8 fixed my bug in ger.c 2014-07-02 10:39:33 +02:00
wernsaar
be94db096c disabled *3M functions for x86_64 platforms 2014-07-01 16:18:05 +02:00
wernsaar
b079df9ef4 added optimized sdot- and dsdot-kernel, written in C 2014-06-30 14:46:38 +02:00
wernsaar
aee61456a4 disabled SMP for sbmv and zsbmv again 2014-06-29 21:18:38 +02:00
wernsaar
01a119abfc enabled SMP for sbmv and zsbmv, but only for 64bit binaries 2014-06-29 20:35:56 +02:00
wernsaar
1fad2b759f enabled smp for ger.c and zger.c, but only for 64bit binaries 2014-06-29 16:43:04 +02:00
wernsaar
e1e83a1b71 modification, to run blas-test on Windows 2014-06-29 10:15:29 +02:00
Zhang Xianyi
da3d70420a Merge branch 'develop' 2014-06-29 10:46:22 +08:00
Zhang Xianyi
1127f5a2d7 OpenBLAS 0.2.10 rc1 version. 2014-06-29 10:45:50 +08:00
Zhang Xianyi
0ae4cc2803 Merge branch 'wernsaar-develop' into develop 2014-06-29 10:40:54 +08:00
Zhang Xianyi
99efbbbad5 Fixed #395. Enable optimized cgemm for Sandybridge. Added optimized sdot kernel.
Fixed c/zgemm, zgemv computational error of haswell, piledriver, bullldozer, and
barcelona on Windows.

Merge branch 'develop' of https://github.com/wernsaar/OpenBLAS into wernsaar-develop

Conflicts:
	kernel/Makefile.L1
	kernel/x86_64/KERNEL
	param.h
2014-06-29 10:34:51 +08:00
wernsaar
22e5aee2dd fixed zgemv bug for older AMD Processors 2014-06-28 19:04:49 +02:00
Zhang Xianyi
249917700d Merge branch 'TimothyGu-develop' into develop
Fixed #398. Remove all trailing whitespace except lapack-netlib.
2014-06-28 20:52:07 +08:00
Zhang Xianyi
7a8949e0ce Merge branch 'develop' of https://github.com/TimothyGu/OpenBLAS into TimothyGu-develop
Conflicts:
	driver/others/memory.c
2014-06-28 20:51:31 +08:00
Zhang Xianyi
b82108f899 Merge pull request #399 from TimothyGu/upstr
Build import libs as .dll.a instead of .lib
2014-06-28 20:40:23 +08:00
Zhang Xianyi
8373ad4ec2 Merge pull request #397 from vtjnash/develop
fix #394
2014-06-28 20:38:48 +08:00
wernsaar
35d37e124f bugfix for barcelona zgemv-kernel 2014-06-28 12:36:11 +02:00
wernsaar
d8ba46efdb bugfix for bulldozer cgemm-, zgemm- and zgemv-kernel 2014-06-28 12:16:20 +02:00
wernsaar
a15f22a1f6 bugfix for piledriver cgemm-, zgemm- and zgemv-kernel 2014-06-28 11:46:58 +02:00
wernsaar
b94ea89f52 bugfix for haswell cgemm- and zgemm-kernel 2014-06-28 10:22:40 +02:00
wernsaar
35f668bb14 bugfix for cgemm_kernel_8x2_sandy.S 2014-06-28 10:01:56 +02:00
Timothy Gu
4ebbf758f5 .gitignore: add some more entries concerned with kernel
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-06-27 13:58:42 -07:00
Timothy Gu
8615d6ec87 Build import libs as .dll.a instead of .lib
This is MinGW convention.

Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-06-27 13:34:09 -07:00
Timothy Gu
6c2ead30f0 Remove all trailing whitespace except lapack-netlib
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-06-27 12:05:18 -07:00
Jameson Nash
f41f03ab83 fix #394. this cleans up some handles after using them, and doesn't disable ALL process privileges upon success 2014-06-27 12:16:57 -04:00
wernsaar
365e8de346 added optimized cgemm-kernel for SANDYBRIDGE 2014-06-27 13:40:29 +02:00
wernsaar
578d1b6219 added DSDOT definition and enabled optimized sdot kernel 2014-06-27 11:30:29 +02:00
wernsaar
a6ae079b17 added blas-test from lapack 2014-06-27 10:12:19 +02:00
Zhang Xianyi
d10db52edb Merge pull request #390 from wernsaar/develop
Ref #103: enhancement for small matrix dimensions. Fixed some bugs. Enable sgemm for SNB and dgemm for NEHALEM
2014-06-27 14:57:06 +08:00
wernsaar
dabab2b5f4 added new optimized sgemm kernel for SANDYBRIGE 2014-06-26 21:42:08 +02:00
wernsaar
aa2709c4e0 enabled optimized dgemm kernel for NEHALEM 2014-06-26 12:22:29 +02:00
wernsaar
9d6f2b594e Fortran flag -frecursive is disabled by default 2014-06-25 13:55:19 +02:00
wernsaar
a13bcc1716 enabled optimized sgemv kernel for barcelona and piledriver 2014-06-25 13:50:57 +02:00
wernsaar
d2c82d7543 enabled optimized sgemv kernel for HASWELL 2014-06-25 12:56:45 +02:00
wernsaar
0517672dd0 enabled optimized sgemv kernels for nehalem, sandybridge and bulldozer 2014-06-25 12:38:14 +02:00
wernsaar
15d5dfa92c fixed compiler warnings 2014-06-25 11:32:44 +02:00
wernsaar
d83373db61 added parameter for gemm3m kernels 2014-06-25 10:40:25 +02:00
wernsaar
88b6bf251a force fallback for x86 32bit 2014-06-22 17:27:11 +02:00
wernsaar
4a2ab7460b Ref #391: force fallback for x86 32bit 2014-06-22 13:51:17 +02:00
wernsaar
86d8c8978b Ref #391: disabled SMP in ger.c and zger.c 2014-06-22 12:01:24 +02:00
wernsaar
316df0e821 fixed bug for INTERFACE64 2014-06-22 09:49:20 +02:00
wernsaar
438002204d Ref #393: fix for INTERFACE64=0 and ARCH_X86 in divtable 2014-06-21 12:29:23 +02:00
wernsaar
23203d52c1 Ref #380: lowered stack usage for haswell kernels 2014-06-19 14:31:52 +02:00
wernsaar
73545a79cd Ref #380: lowered stack usage for piledriver and bulldozer kernels 2014-06-19 14:02:14 +02:00
wernsaar
a19d209005 Ref #103: enhancement for small matrix dimensions 2014-06-18 15:04:11 +02:00
Zhang Xianyi
8602816536 Merge pull request #387 from davidanthoff/fixbuilderroronwin
Add -lgfortran flag to gcc call in a makefile
2014-06-18 07:57:30 +08:00
Zhang Xianyi
d52863cfd7 Merge pull request #386 from wernsaar/develop
Some enhancements for dynamic_arch and some warning fixes
2014-06-18 07:56:08 +08:00
David Anthoff
c6361d63c2 Add -lgfortran flag to gcc call in a makefile
Adding $(EXTRALIB) adds this flag when things are built with
msys2 on windows. Without this the build fails.
2014-06-13 21:10:27 -07:00
wernsaar
53bfa51ee0 Ref #385: fixed warnings in dynamic.c 2014-06-12 18:17:08 +02:00
wernsaar
ff9cfca24c Ref #385: added missing return instruction 2014-06-12 15:52:14 +02:00
wernsaar
a86d349a51 Ref #380: enhancements for dynamic_arch 2014-06-12 14:20:03 +02:00
Zhang Xianyi
7b277f0110 Merge pull request #384 from wernsaar/develop
Blas extensions
2014-06-11 09:49:27 +08:00
wernsaar
faeab93df0 Ref #51: added blas extensions simatcopy, dimatcopy, cimatcopy, zimatcopy 2014-06-10 16:14:34 +02:00
Zhang Xianyi
f773f492f3 Merge branch 'develop' 2014-06-10 21:55:47 +08:00
Zhang Xianyi
21a6b5f79e OpenBLAS 0.2.9 Version. 2014-06-10 21:55:19 +08:00
wernsaar
cee257f384 Ref #51: added blas extensions zomatcopy and comatcopy 2014-06-10 10:34:54 +02:00
wernsaar
7bfb3011e8 Ref #51: added blas extension somatcopy 2014-06-09 20:21:13 +02:00
wernsaar
8c8f596238 Ref #51: added blas extension domatcopy as not opimized reference 2014-06-09 17:11:07 +02:00
wernsaar
bff575d0b1 Ref #375: added workaround for small sizes to scal.c and zscal.c 2014-06-08 13:49:19 +02:00
wernsaar
faf3ac0aad Ref #285: added axpby kernels 2014-06-08 11:54:24 +02:00
Zhang Xianyi
a40116de25 Fixed generating DLL bug. 2014-06-06 16:13:08 +08:00
Zhang Xianyi
b31ec99372 Fixed #374.
Merge branch 'TimothyGu-develop' into develop
2014-06-05 17:01:44 +08:00
Zhang Xianyi
0ac073fa94 Merge pull request #376 from wernsaar/develop
Merged some Lapack optimized functions
https://github.com/xianyi/OpenBLAS/wiki/Fixed-optimized-kernels-To-do-List
2014-05-26 04:46:06 -05:00
wernsaar
25e899b60b fixed function profile in zpotri.c 2014-05-25 09:15:22 +02:00
wernsaar
219bcb119d added lapack and lapacke timing libs by default 2014-05-24 15:53:25 +02:00
wernsaar
5664445543 changed threshold value for sep.in from 50.0 to 60.0 2014-05-23 17:26:50 +02:00
wernsaar
89da450800 enabled and tested optimized potri lapack functions 2014-05-23 12:14:30 +02:00
wernsaar
c26bbee489 enabled abd tested optimized trtri lapack functions 2014-05-23 10:55:39 +02:00
Timothy Gu
ced13574a0 Random "walk (a)round" --> "work-around" typo fixes
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-05-22 18:11:52 -07:00
Timothy Gu
fe858873af Add NO_STATIC variable which disables static lib installation
Static library is still built for shared lib generation.

Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-05-22 18:06:26 -07:00
Timothy Gu
a8d4d1c4d3 Build import library for mingw
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-05-22 18:06:26 -07:00
wernsaar
c4ccb3fbb2 removed lapack/getri because it was never used 2014-05-21 14:21:19 +02:00
wernsaar
a748d3a75d enabled optimized trti2 lapack functions again 2014-05-21 11:02:07 +02:00
wernsaar
a5ab231ad4 enabled optimized complex lauum lapack functions again 2014-05-21 10:35:28 +02:00
wernsaar
dbaeea7b59 enabled lauu2 and lauum lapack functions again 2014-05-21 09:49:18 +02:00
Zhang Xianyi
10a16bd690 Refs #372. Fixed a lot of bugs about LAPACK testing.
As a walk round solution, we rolled back some kernels.

Please check https://github.com/xianyi/OpenBLAS/wiki/Fixed-optimized-kernels-To-do-List

Merge branch 'wernsaar-develop' into develop
2014-05-21 11:36:46 +08:00
Zhang Xianyi
406f5bd22b Merge branch 'develop' of https://github.com/wernsaar/OpenBLAS into wernsaar-develop
Conflicts:
	kernel/arm/KERNEL.ARMV6
2014-05-21 11:24:39 +08:00
wernsaar
a0ae53966f removed debug flag from Makefile.rule 2014-05-19 15:57:18 +02:00
wernsaar
0d75f3b6a2 enabled and tested optimized gesv lapack functions 2014-05-19 14:44:53 +02:00
wernsaar
abad6f66d6 marked trti2.c and ztrti2.c as bad 2014-05-19 13:50:02 +02:00
wernsaar
2ff66e661d enabled and tested optimized laswp lapack function 2014-05-19 13:35:32 +02:00
wernsaar
5e55034922 marked zlauu2.c and zlauum.c as bad 2014-05-19 12:53:22 +02:00
wernsaar
9a9e810239 marked trtri.c and ztrtri as bad 2014-05-19 12:42:52 +02:00
wernsaar
45be9ac111 moved trtri.c and ztrtri.c to the directory lapack 2014-05-19 12:29:29 +02:00
wernsaar
9f201558c9 marked lauu2.c and lauum.c as bad 2014-05-19 12:00:16 +02:00
wernsaar
d4237cb7f3 marked larf.c as obsolete 2014-05-19 11:23:17 +02:00
Zhang Xianyi
d2a8ff4b04 Merge branch 'TimothyGu-develop' into develop 2014-05-19 10:37:20 +08:00
Timothy Gu
f331cb1a76 Remove code for downloading lapack tarball and the patches themselves
They are not used anymore since 3eb5af1.

Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-05-18 19:09:26 -07:00
Timothy Gu
9ed981c5dc Remove unused dll2 target
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-05-18 18:54:38 -07:00
wernsaar
aaa9d7fbf8 marked potri functions as bad because a lot of errors 2014-05-18 23:41:13 +02:00
wernsaar
ebc95e6f11 enabled and tested optimized potf2 lapack functions 2014-05-18 22:41:43 +02:00
wernsaar
61a2c50e8e enabled and tested optimized getf2 lapack functions 2014-05-18 22:21:16 +02:00
wernsaar
4f98f8c9b3 enabled and tested optimized potrf lapack functions 2014-05-18 21:42:37 +02:00
wernsaar
536875d463 enabled and tested optimized getrs lapack functions 2014-05-18 21:13:56 +02:00
wernsaar
65f2fba4c3 enabled and tested optimized cgetrf lapack function 2014-05-18 20:32:27 +02:00
wernsaar
eea6f51df9 enabled and tested optimized sgetrf lapack function 2014-05-18 20:01:23 +02:00
wernsaar
6fc4646709 enabled and tested optimized zgetrf lapack function 2014-05-18 19:36:32 +02:00
wernsaar
ac029f81b3 enabled and tested optimized dgetrf function 2014-05-18 19:07:51 +02:00
wernsaar
c0cf875a82 added optimized lapack files from OpenBLAS 2014-05-18 14:09:22 +02:00
Timothy Gu
b6d904838e Remove routines for generating exports/symbol.S
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-05-17 16:02:36 -07:00
Timothy Gu
5379eff022 Remove routines for making exports/linux.def
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
2014-05-17 16:01:30 -07:00
wernsaar
aaddb05411 bugfix for ARMV6 2014-05-17 13:00:36 +02:00
wernsaar
e52532a9fe enable debug for lapack testing 2014-05-17 11:18:26 +02:00
wernsaar
e826a5a6af some modifications regarding lapack test 2014-05-16 20:37:41 +02:00
wernsaar
165d5436b5 changed threshold to 50.0 2014-05-16 20:34:48 +02:00
wernsaar
409b52255c changed default optimization flag from O3 to O2 for ARM 2014-05-16 14:36:24 +02:00
wernsaar
5953972a5a changed threshold for 50.0 to 54.0 in svd.in 2014-05-16 14:32:10 +02:00
wernsaar
d751224ea4 changed YIELDING for BULLDOZER 2014-05-15 11:37:38 +02:00
wernsaar
4a5938b5cc Modified lapack-test, using lapack_testing.py to run tests 2014-05-14 15:16:21 +02:00
wernsaar
d18bc5468f added FCOMMON_OPT for lapack 2014-05-14 15:01:03 +02:00
wernsaar
8877c6db51 changed label lapack-test 2014-05-14 13:08:05 +02:00
wernsaar
c38379c9dd bugfixes for ARM regarding lapack tests 2014-05-14 13:03:45 +02:00
wernsaar
a0b07c1440 bugfixs for ARM regarding lapack tests 2014-05-14 12:59:20 +02:00
wernsaar
43fbdb7a5a added ARMV5 as reference platform 2014-05-13 17:25:19 +02:00
wernsaar
777cebc8c7 added ZERO check to zscal.c because bug in lapack-testing 2014-05-13 16:31:00 +02:00
wernsaar
aa5c73e20f added ZERO check to zscal.c because bug in lapack-test 2014-05-13 16:25:21 +02:00
wernsaar
5e5ef28ca0 added ZERO check because bug in lapack-test 2014-05-13 15:36:03 +02:00
wernsaar
650ed34336 added ZERO check because bug in lapack-test 2014-05-13 15:31:36 +02:00
wernsaar
189ca1bcee removed lapack objects from interface/Makefile 2014-05-11 12:09:34 +02:00
wernsaar
4c1caa7454 checked, that zhpr is OK 2014-05-11 11:21:23 +02:00
wernsaar
7bb19cf90e checked, that zhpr2 is OK 2014-05-11 11:11:05 +02:00
wernsaar
2a94aaaf2e checked, that zhpmv is OK 2014-05-11 10:46:48 +02:00
wernsaar
5e4b4f6712 checked, that zher is OK 2014-05-11 10:36:34 +02:00
wernsaar
47e8950e77 checked, that zher2 is OK 2014-05-11 10:26:05 +02:00
wernsaar
f45f2c8465 checked, that zhemv is OK 2014-05-11 10:15:06 +02:00
wernsaar
10780ae650 marked zhbmv as smp bug 2014-05-11 09:58:16 +02:00
wernsaar
9bae50f700 checked, that zscal and zswap are OK 2014-05-11 09:30:18 +02:00
wernsaar
0758c1a374 checked, that trtri is OK 2014-05-11 09:11:20 +02:00
wernsaar
564ff395f6 checked, that trsm is OK 2014-05-11 08:59:33 +02:00
wernsaar
7fb78a5f01 checked, that trmv is OK 2014-05-11 08:47:44 +02:00
wernsaar
8204ab4aa8 checked, that tpmv is OK 2014-05-11 08:35:34 +02:00
wernsaar
48d1325784 checked, that tbmv is OK 2014-05-11 08:22:00 +02:00
wernsaar
57bbc586ef checked, that syrk is OK 2014-05-11 08:10:25 +02:00
wernsaar
bfef3c5dd1 checked, that syr is OK 2014-05-11 07:46:22 +02:00
wernsaar
d972f4a60a check, that syr2k is OK 2014-05-11 01:04:46 +02:00
wernsaar
eebce01cf2 checked, that syr2 is OK 2014-05-11 00:48:49 +02:00
wernsaar
e2c39a4a8e checked, that symv is OK 2014-05-11 00:36:56 +02:00
wernsaar
1e8e6faa7e checked, that symm is OK 2014-05-11 00:22:40 +02:00
wernsaar
c7eb901496 checked, that spr is OK 2014-05-11 00:07:07 +02:00
wernsaar
2ed03ea0a2 checked, that spr2 is OK 2014-05-10 23:55:43 +02:00
wernsaar
de00e2937a marked as smp bug 2014-05-10 23:18:35 +02:00
wernsaar
e187b5e9d0 removed gesv.c from interface 2014-05-10 22:55:44 +02:00
wernsaar
0947fc1c89 checked, that ger is OK 2014-05-10 22:49:53 +02:00
wernsaar
4d61607c9e cheched, that gbmv is OK 2014-05-10 22:38:09 +02:00
wernsaar
781bfb6e66 checked, that gemv is OK 2014-05-10 22:24:05 +02:00
wernsaar
79a82ba7f1 checked that axpy is OK 2014-05-10 22:09:49 +02:00
wernsaar
d63bd7fa5e checked that gemm.c is OK 2014-05-10 21:51:44 +02:00
wernsaar
e265c4ec86 added C files in interface 2014-05-10 21:27:47 +02:00
wernsaar
0732238213 removed all C files in interface 2014-05-10 21:25:17 +02:00
wernsaar
5f3b68b4d4 replaced sgemm and cgemm kernels because lapack bugs 2014-05-10 11:24:07 +02:00
wernsaar
2424af62fd replaced dgemm-kernel because bug in lapack 2014-05-10 10:52:37 +02:00
wernsaar
6b252033ae changed test ratio from 30.0 to 40.0 2014-05-09 13:17:47 +02:00
wernsaar
320c805905 fixed incorrect parameter 2 errors 2014-05-08 11:06:32 +02:00
wernsaar
e673848a9b added log file for lapack development 2014-05-07 14:36:49 +02:00
wernsaar
a35a1a9ae7 changed makefiles for lapack development 2014-05-07 11:33:02 +02:00
wernsaar
793509a3b5 replaced files for sdot, sgemv_n and sgemv_t for bug #348 2014-05-06 15:29:39 +02:00
Zhang Xianyi
020f36f970 Merge pull request #367 from xantares/patch-2
Makefile typo
2014-05-02 17:55:08 +08:00
Zhang Xianyi
9d0cc399ac Merge pull request #366 from xantares/patch-1
Install dll to prefix/bin instead of prefix/lib
2014-05-02 17:54:22 +08:00
wernsaar
025fc914cc fixed 2 bugs as reported by Brendan Tracey 2014-05-02 11:34:26 +02:00
xantares
43bb633096 Update Makefile 2014-05-02 08:54:22 +02:00
xantares
187237b622 Install dll to prefix/bin instead of prefix/bin 2014-05-01 21:48:26 +02:00
Zhang Xianyi
66198faab6 Refs #63. delete prefix for mingw64 toolchain. 2014-04-27 13:05:26 +08:00
wernsaar
47b22763f8 reduced stack usage on windows to 16K 2014-04-24 14:09:26 +02:00
Zhang Xianyi
4d42368214 Refs #355. Fixed ARM detection bug. 2014-03-22 15:08:18 +08:00
Zhang Xianyi
3e068e78e2 Merge branch 'release-0.2.9' 2014-03-06 17:45:31 +08:00
Zhang Xianyi
1140c489c9 #351. Release 0.2.9 rc2. 2014-03-06 17:44:03 +08:00
Zhang Xianyi
804a306313 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2014-03-06 12:53:10 +08:00
wernsaar
9db0fb8b02 bugfix for sdsdot 2014-02-28 14:59:36 +01:00
wernsaar
692b14cecd rewrote rotmg.c instead of modifying very old code 2014-02-28 14:43:28 +01:00
Zhang Xianyi
322a178430 Merge pull request #345 from ogrisel/fix-non-smp-server-pthread_atfork-reference
Do not reference pthread_atfork in non-SMP_SERVER mode
2014-02-26 00:54:01 +08:00
Zhang Xianyi
f80f29e256 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2014-02-26 00:38:29 +08:00
Olivier Grisel
2c556f093a Add cast to function pointer to remove warning 2014-02-25 11:08:32 +01:00
Olivier Grisel
3b027d2528 Do not reference pthread_atfork in non-SMP_SERVER mode 2014-02-25 11:08:32 +01:00
Zhang Xianyi
57526cae99 Merge pull request #346 from ogrisel/fix-openblas_config.h
More robust OPENBLAS_ prefixing of macros in openblas_config.h
2014-02-25 06:43:30 +08:00
Olivier Grisel
5de5ef118c More robust OPENBLAS_ prefixing of macros in openblas_config.h 2014-02-24 13:21:06 +01:00
Zhang Xianyi
b161ac29e3 Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2014-02-21 07:15:35 +08:00
Zhang Xianyi
b20ee6924a Merge pull request #343 from ogrisel/fix-294-fork-safe-pthread
FIX #294: fork-safe pthread mode
2014-02-20 06:58:27 +08:00
Olivier Grisel
49bd98f410 Do not reference pthread_atfork under windows 2014-02-19 19:25:48 +01:00
Olivier Grisel
a14f98ca7c Make sure that fork_test.c is not built under windows 2014-02-19 19:14:13 +01:00
Olivier Grisel
138a841390 FIX #294: make OpenBLAS thread-pool resilient to fork via pthread_atfork 2014-02-19 19:01:15 +01:00
Olivier Grisel
046e4013cb Revert "Refs #294. Used pthread_atfork to avoid hang after a Unix fork."
This reverts commit 3617c22a56.
2014-02-19 18:32:54 +01:00
Zhang Xianyi
dd2d3e61ab Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop 2014-02-18 15:53:57 +08:00
Zhang Xianyi
3617c22a56 Refs #294. Used pthread_atfork to avoid hang after a Unix fork.
The problem is the mutex we used in blas_server. Thus, we must clear
the mutex before the fork and re-init them at parent and child process.

If you used OpenMP, GOMP has the same problem by now. Please try other OpenMP
implemantation.
2014-02-18 15:36:04 +08:00
wernsaar
f9daebba0a checked in bugfixes for ARM 2014-02-16 11:45:47 +01:00
Zhang Xianyi
9a557e90da Refs #340. Fixed SEGFAULT bug of dgemv_n on OSX. 2014-02-15 23:23:15 +08:00
wangqian
2d557eb1e0 Fixed computational error of dgemv_n. 2014-02-04 21:47:51 +08:00
Zhang Xianyi
a789b77b75 Used SwitchToThread for YIELDING on AMD piledriver with Windows. 2014-01-28 16:40:19 +08:00
Zhang Xianyi
75acf96d94 Refs #329 #287. Only disable -fopenmp for LAPACK Fortran codes on Windows. 2014-01-24 15:39:46 +08:00
Zhang Xianyi
8c7687b419 Refs #338. Added OPENBLAS_VERBOSE environment variable on runtime
By default, OpenBLAS doesn't output the warning message. You can set
OPENBLAS_VERBOSE (e.g. export OPENBLAS_VERBOSE=1) to enable the warning
message on runtime.
2014-01-24 02:05:59 +08:00
Zhang Xianyi
3e0a7b931c Refs #333. Detect the wrong parameter for zherk/zher2k. 2014-01-21 01:27:51 +08:00
Zhang Xianyi
306d9f2e35 Fixed #334 a makefile bug in lapacke. 2014-01-19 23:28:11 +08:00
Zhang Xianyi
7b8604ea29 Refs #335. Added the fallback of L2 size detection for some virtual machines. 2014-01-08 11:16:21 +08:00
Zhang Xianyi
ab69443bd4 Refs #332. Added addtional Intel Ivy Bridge and Haswell CPU-id. 2014-01-05 23:44:29 +08:00
Zhang Xianyi
b263e096af Refs #307. Delete debug printf. 2013-12-31 15:53:13 +08:00
Zhang Xianyi
05bb391c3a Refs #330. Fixed the compatible issue with clang on Mac OSX. 2013-12-16 20:31:17 +08:00
Zhang Xianyi
0ab080987d Release 0.2.9 rc1 version. 2013-12-13 20:48:05 +08:00
5037 changed files with 365233 additions and 63137 deletions

8
.gitignore vendored
View File

@@ -15,14 +15,17 @@ lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/TESTING/testing_results.txt
*.so
*.so.*
*.a
.svn
*~
lib.grd
nohup.out
config.h
config_kernel.h
Makefile.conf
Makefile.conf_last
Makefile_kernel.conf
config_last.h
getarch
getarch_2nd
@@ -41,6 +44,8 @@ ctest/xzcblat2
ctest/xzcblat3
exports/linktest.c
exports/linux.def
kernel/setparam_*.c
kernel/kernel_*.h
test/CBLAT2.SUMM
test/CBLAT3.SUMM
test/DBLAT2.SUMM
@@ -61,3 +66,6 @@ test/sblat3
test/zblat1
test/zblat2
test/zblat3
build
build.*
*.swp

View File

@@ -1,4 +1,13 @@
language: c
notifications:
webhooks:
urls:
- https://webhooks.gitter.im/e/8a6e4470a0cebd090344
on_success: change # options: [always|never|change] default: always
on_failure: always # options: [always|never|change] default: always
on_start: never # options: [always|never|change] default: always
compiler:
- gcc
@@ -11,11 +20,16 @@ env:
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq gfortran
- sudo apt-get install -qq gfortran
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
script:
- set -e
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
# whitelist
branches:

200
CMakeLists.txt Normal file
View File

@@ -0,0 +1,200 @@
##
## Author: Hank Anderson <hank@statease.com>
##
cmake_minimum_required(VERSION 2.8.4)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
set(OpenBLAS_PATCH_VERSION 18)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
enable_language(C)
if(MSVC)
set(OpenBLAS_LIBNAME libopenblas)
else()
set(OpenBLAS_LIBNAME openblas)
endif()
#######
if(MSVC)
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
option(BUILD_DEBUG "Build Debug Version" OFF)
#######
if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif()
if(BUILD_DEBUG)
set(CMAKE_BUILD_TYPE Debug)
else()
set(CMAKE_BUILD_TYPE Release)
endif()
if(BUILD_WITHOUT_CBLAS)
set(NO_CBLAS 1)
endif()
#######
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake")
include("${CMAKE_SOURCE_DIR}/cmake/system.cmake")
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
if (NOT DYNAMIC_ARCH)
list(APPEND BLASDIRS kernel)
endif ()
if (DEFINED SANITY_CHECK)
list(APPEND BLASDIRS reference)
endif ()
set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
endif ()
# set which float types we want to build for
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
# if none are defined, build for all
set(BUILD_SINGLE true)
set(BUILD_DOUBLE true)
set(BUILD_COMPLEX true)
set(BUILD_COMPLEX16 true)
endif ()
set(FLOAT_TYPES "")
if (BUILD_SINGLE)
message(STATUS "Building Single Precision")
list(APPEND FLOAT_TYPES "SINGLE") # defines nothing
endif ()
if (BUILD_DOUBLE)
message(STATUS "Building Double Precision")
list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE
endif ()
if (BUILD_COMPLEX)
message(STATUS "Building Complex Precision")
list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX
endif ()
if (BUILD_COMPLEX16)
message(STATUS "Building Double Complex Precision")
list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
endif ()
set(SUBDIRS_ALL ${SUBDIRS} test ctest utest exports benchmark ../laswp ../bench)
# all :: libs netlib tests shared
# libs :
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.")
endif ()
if (${NO_STATIC} AND ${NO_SHARED})
message(FATAL_ERROR "Neither static nor shared are enabled.")
endif ()
#Set default output directory
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "")
foreach (SUBDIR ${SUBDIRS})
add_subdirectory(${SUBDIR})
string(REPLACE "/" "_" subdir_obj ${SUBDIR})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:${subdir_obj}>")
endforeach ()
# netlib:
# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
if (NOT NOFORTRAN AND NOT NO_LAPACK)
include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake")
if (NOT NO_LAPACKE)
include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake")
endif ()
endif ()
#Only generate .def for dll on MSVC
if(MSVC)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
endforeach()
enable_testing()
add_subdirectory(utest)
if(NOT MSVC)
#only build shared library for MSVC
add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
if(SMP)
target_link_libraries(${OpenBLAS_LIBNAME} pthread)
target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
endif()
#build test and ctest
add_subdirectory(test)
if(NOT NO_CBLAS)
add_subdirectory(ctest)
endif()
endif()
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)
# TODO: Why is the config saved here? Is this necessary with CMake?
#Save the config files for installation
# @cp Makefile.conf Makefile.conf_last
# @cp config.h config_last.h
#ifdef QUAD_PRECISION
# @echo "#define QUAD_PRECISION">> config_last.h
#endif
#ifeq ($(EXPRECISION), 1)
# @echo "#define EXPRECISION">> config_last.h
#endif
###
#ifeq ($(DYNAMIC_ARCH), 1)
# @$(MAKE) -C kernel commonlibs || exit 1
# @for d in $(DYNAMIC_CORE) ; \
# do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
# done
# @echo DYNAMIC_ARCH=1 >> Makefile.conf_last
#endif
#ifdef USE_THREAD
# @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
#endif
# @touch lib.grd

View File

@@ -10,15 +10,28 @@
* Optimize BLAS3 on ICT Loongson 3A.
* Optimize BLAS3 on Intel Sandy Bridge.
* Werner Saar <wernsaar@googlemail.com>
* [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer
* [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer
* [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer
* [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer
* [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer
* [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer
* [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer
* [2013-06-21] Optimize dcopy kernel on AMD Bulldozer
* Porting and Optimization on ARM Cortex-A9
* Optimization on AMD Piledriver
* Optimization on Intel Haswell
## Previous Developers
* Zaheer Chothia <zaheer.chothia@gmail.com>
* Improve the compatibility about complex number
* Build LAPACKE: C interface to LAPACK
* Improve the windows build.
## Previous Developers
* Chen Shaohu <huhumartinwar@gmail.com>
* Optimize GEMV on the Loongson 3A processor.
* Optimize GEMV on the Loongson 3A processor.
* Luo Wen
* Intern. Test Level-2 BLAS.
@@ -40,11 +53,11 @@ In chronological order:
* [2012-05-19] Fix building bug on FreeBSD and NetBSD.
* Sylvestre Ledru <https://github.com/sylvestre>
* [2012-07-01] Improve the detection of sparc. Fix building bug under
* [2012-07-01] Improve the detection of sparc. Fix building bug under
Hurd and kfreebsd.
* Jameson Nash <https://github.com/vtjnash>
* [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to
* [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to
make on the command line.
* Alexander Nasonov <alnsn@yandex.ru>
@@ -52,16 +65,7 @@ In chronological order:
* Sébastien Villemot <sebastien@debian.org>
* [2012-11-14] Fix compilation with TARGET=GENERIC. Patch applied to Debian package.
* Werner Saar <wernsaar@googlemail.com>
* [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer
* [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer
* [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer
* [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer
* [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer
* [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer
* [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer
* [2013-06-21] Optimize dcopy kernel on AMD Bulldozer
* [2013-08-28] Avoid failure on qemu guests declaring an Athlon CPU without 3dnow!
* Kang-Che Sung <Explorer09@gmail.com>
* [2013-05-17] Fix typo in the document. Re-order the architecture list in getarch.c.
@@ -76,15 +80,73 @@ In chronological order:
* [2013-06-30] Add Intel Haswell support (using sandybridge optimizations).
* grisuthedragon <https://github.com/grisuthedragon>
* [2013-07-11] create openblas_get_parallel to retrieve information which parallelization
* [2013-07-11] create openblas_get_parallel to retrieve information which parallelization
model is used by OpenBLAS.
* Elliot Saba <staticfloat@gmail.com>
* [2013-07-22] Add in return value for `interface/trtri.c`
* Sébastien Fabbro <bicatali@gentoo.org>
* [2013-07-24] Modify makefile to respect user's LDFLAGS
* [2013-07-24] Add stack markings for GNU as arch-independent for assembler files
* Viral B. Shah <viral@mayin.org>
* [2013-08-21] Patch LAPACK XLASD4.f as discussed in JuliaLang/julia#2340
* Lars Buitinck <https://github.com/larsmans>
* [2013-08-28] get rid of the generated cblas_noconst.h file
* [2013-08-28] Missing threshold in gemm.c
* [2013-08-28] fix default prefix handling in makefiles
* yieldthought <https://github.com/yieldthought>
* [2013-10-08] Remove -Wl,--retain-symbols-file from dynamic link line to fix tool support
* Keno Fischer <https://github.com/loladiro>
* [2013-10-23] Use FC instead of CC to link the dynamic library on OS X
* Christopher Meng <cickumqt@gmail.com>
* [2013-12-09] Add DESTDIR support for easier building on RPM based distros.
Use install command instead of cp to install files with permissions control.
* Lucas Beyer <lucasb.eyer.be@gmail.com>
* [2013-12-10] Added support for NO_SHARED in make install.
* carlkl <https://github.com/carlkl>
* [2013-12-13] Fixed LAPACKE building bug on Windows
* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]
* Isaac Dunham <https://github.com/idunham>
* [2014-08-03] Fixed link error on Linux/musl
* Dave Nuechterlein
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
ARMv8 support.
* Jerome Robert <jeromerobert@gmx.com>
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
* [2015-12-28] Allow to force the number of parallel make job
* [2015-12-28] Fix detection of AMD E2-3200 detection
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what expected
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
* Dan Kortschak
* [2015-01-07] Added test for drotmg bug #484.
* Ton van den Heuvel <https://github.com/ton>
* [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity().
* Martin Koehler <https://github.com/grisuthedragon/>
* [2015-09-07] Improved imatcopy
* Ashwin Sekhar T K <https://github.com/ashwinyes/>
* [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
* [2015-11-20] lapack-test fixes for Cortex-A57
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
* theoractice <https://github.com/theoractice/>
* [2016-03-20] Fix compiler error in VisualStudio with CMake
* [2016-03-22] Fix access violation on Windows while static linking

View File

@@ -1,4 +1,263 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.18
12-Apr-2016
common:
* If you set MAKE_NB_JOBS flag less or equal than zero,
make will be without -j.
x86/x86_64:
* Support building Visual Studio static library. (#813, Thanks, theoractice)
* Fix bugs to pass buidbot CI tests (http://build.openblas.net)
ARM:
* Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K)
POWER:
* Optimize S and C BLAS3 on Power8
* Optimize BLAS2/1 on Power8
====================================================================
Version 0.2.17
20-Mar-2016
common:
* Enable BUILD_LAPACK_DEPRECATED=1 by default.
====================================================================
Version 0.2.16
15-Mar-2016
common:
* Avoid potential getenv segfault. (#716)
* Import LAPACK svn bugfix #142-#147,#150-#155
x86/x86_64:
* Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller
* Fix bug with scipy linalg test.
ARM:
* Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
POWER:
* Optimize D and Z BLAS3 functions for Power8.
====================================================================
Version 0.2.16.rc1
23-Feb-2016
common:
* Upgrade LAPACK to 3.6.0 version.
Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build
LAPACK deprecated functions.
* Add MAKE_NB_JOBS option in Makefile.
Force number of make jobs.This is particularly
useful when using distcc. (#735. Thanks, Jerome Robert.)
* Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor).
* Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert)
* Improve small zger, zgemv, ztrmv using stack alloction (#727. Thanks, Jerome Robert)
* Let openblas_get_num_threads return the number of active threads.
(#760. Thanks, Jerome Robert)
* Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen)
* Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey)
* Update scipy benchmark script. (#745. Thanks, John Kirkham)
x86/x86_64:
* Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller.
* Detect Intel Avoton.
* Detect AMD Trinity, Richland, E2-3200.
* Fix gemv performance bug on Mac OSX Intel Haswell.
* Fix some bugs with CMake and Visual Studio
ARM:
* Support and optimize Cortex-A57 AArch64.
(#686. Thanks, Ashwin Sekhar TK)
* Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere)
* Update ARMV6 kernels.
POWER:
* Fix detection of POWER architecture
(#684. Thanks, Sebastien Villemot)
====================================================================
Version 0.2.15
27-Oct-2015
common:
* Support cmake on x86/x86-64. Natively compiling on MS Visual Studio.
(experimental. Thank Hank Anderson for the initial cmake porting work.)
On Linux and Mac OSX, OpenBLAS cmake supports assembly kernels.
e.g. cmake .
make
make test (Optional)
On Windows MS Visual Studio, OpenBLAS cmake only support C kernels.
(OpenBLAS uses AT&T style assembly, which is not supported by MSVC.)
e.g. cmake -G "Visual Studio 12 Win64" .
Open OpenBLAS.sln and build.
* Enable MAX_STACK_ALLOC flags by default.
Improve ger and gemv for small matrices.
* Improve gemv parallel with small m and large n case.
* Improve ?imatcopy when lda==ldb (#633. Thanks, Martin Koehler)
* Add vecLib benchmarks (#565. Thanks, Andreas Noack.)
* Fix LAPACK lantr for row major matrices (#634. Thanks, Dan Kortschak)
* Fix LAPACKE lansy (#640. Thanks, Dan Kortschak)
* Import bug fixes for LAPACKE s/dormlq, c/zunmlq
* Raise the signal when pthread_create fails (#668. Thanks, James K. Lowden)
* Remove g77 from compiler list.
* Enable AppVeyor Windows CI.
x86/x86-64:
* Support pure C generic kernels for x86/x86-64.
* Support Intel Boardwell and Skylake by Haswell kernels.
* Support AMD Excavator by Steamroller kernels.
* Optimize s/d/c/zdot for Intel SandyBridge and Haswell.
* Optimize s/d/c/zdot for AMD Piledriver and Steamroller.
* Optimize s/d/c/zapxy for Intel SandyBridge and Haswell.
* Optimize s/d/c/zapxy for AMD Piledriver and Steamroller.
* Optimize d/c/zscal for Intel Haswell, dscal for Intel SandyBridge.
* Optimize d/c/zscal for AMD Bulldozer, Piledriver and Steamroller.
* Optimize s/dger for Intel SandyBridge.
* Optimize s/dsymv for Intel SandyBridge.
* Optimize ssymv for Intel Haswell.
* Optimize dgemv for Intel Nehalem and Haswell.
* Optimize dtrmm for Intel Haswell.
ARM:
* Support Android NDK armeabi-v7a-hard ABI (-mfloat-abi=hard)
e.g. make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7
* Fix lock, rpcc bugs (#616, #617. Thanks, Grazvydas Ignotas)
POWER:
* Support ppc64le platform (ELF ABI v2. #612. Thanks, Matthew Brandyberry.)
* Support POWER7/8 by POWER6 kernels. (#612. Thanks, Fábio Perez.)
====================================================================
Version 0.2.14
24-Mar-2015
common:
* Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.)
* Improve ger and gemv for small matrices by stack allocation.
e.g. make -DMAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.)
* Introduce openblas_get_num_threads and openblas_get_num_procs.
(#497. Thanks, Erik Schnetter.)
* Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.)
* Fix c/zsyr bug with negative incx. (#492.)
* Fix race condition during shutdown causing a crash in
gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.)
x86/x86-64:
* Support AMD Streamroller.
ARM:
* Add Cortex-A9 and Cortex-A15 targets.
====================================================================
Version 0.2.13
3-Dec-2014
common:
* Add SYMBOLPREFIX and SYMBOLSUFFIX makefile options
for adding a prefix or suffix to all exported symbol names
in the shared library.(#459, Thanks Tony Kelman)
* Provide OpenBLASConfig.cmake at installation.
* Fix Fortran compiler detection on FreeBSD.
(#470, Thanks Mike Nolta)
x86/x86-64:
* Add generic kernel files for x86-64. make TARGET=GENERIC
* Fix a bug of sgemm kernel on Intel Sandy Bridge.
* Fix c_check bug on some amd64 systems. (#471, Thanks Mike Nolta)
ARM:
* Support APM's X-Gene 1 AArch64 processors.
Optimize trmm and sgemm. (#465, Thanks Dave Nuechterlein)
====================================================================
Version 0.2.12
13-Oct-2014
common:
* Added CBLAS interface for ?omatcopy and ?imatcopy.
* Enable ?gemm3m functions.
* Added benchmark for ?gemm3m.
* Optimized multithreading lower limits.
* Disabled SYMM3M and HEMM3M functions
because of segment violations.
x86/x86-64:
* Improved axpy and symv performance on AMD Bulldozer.
* Improved gemv performance on modern Intel and AMD CPUs.
====================================================================
Version 0.2.11
18-Aug-2014
common:
* Added some benchmark codes.
* Fix link error on Linux/musl.(Thanks Isaac Dunham)
x86/x86-64:
* Improved s/c/zgemm performance for Intel Haswell.
* Improved s/d/c/zgemv performance.
* Support the big numa machine.(EXPERIMENT)
ARM:
* Fix detection when cpuinfo uses "Processor". (Thanks Isaiah)
====================================================================
Version 0.2.10
16-Jul-2014
common:
* Added BLAS extensions as following.
s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy.
* Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34)
* Added NO_AVX2 flag for old binutils. (#401)
* Support outputing the CPU corename on runtime.(#407)
* Patched LAPACK to fix bug 114, 117, 118.
(http://www.netlib.org/lapack/bug_list.html)
* Disabled ?gemm3m for a work-around fix. (#400)
x86/x86-64:
* Fixed lots of bugs for optimized kernels on sandybridge,Haswell,
bulldozer, and piledriver.
https://github.com/xianyi/OpenBLAS/wiki/Fixed-optimized-kernels-To-do-List
ARM:
* Improved LAPACK testing.
====================================================================
Version 0.2.9
10-Jun-2014
common:
* Improved the result for LAPACK testing. (#372)
* Installed DLL to prefix/bin instead of prefix/lib. (#366)
* Build import library on Windows.(#374)
x86/x86-64:
* To improve LAPACK testing, we fallback some kernels. (#372)
https://github.com/xianyi/OpenBLAS/wiki/Fixed-optimized-kernels-To-do-List
====================================================================
Version 0.2.9.rc2
06-Mar-2014
common:
* Added OPENBLAS_VERBOSE environment variable.(#338)
* Make OpenBLAS thread-pool resilient to fork via pthread_atfork.
(#294, Thank Olivier Grisel)
* Rewrote rotmg
* Fixed sdsdot bug.
x86/x86-64:
* Detect Intel Haswell for new Macbook.
====================================================================
Version 0.2.9.rc1
13-Jan-2013
common:
* Update LAPACK to 3.5.0 version
* Fixed compatiable issues with Clang and Pathscale compilers.
x86/x86-64:
* Optimization on Intel Haswell.
* Enable optimization kernels on AMD Bulldozer and Piledriver.
ARM:
* Support ARMv6 and ARMv7 ISA.
* Optimization on ARM Cortex-A9.
====================================================================
Version 0.2.8
01-Aug-2013
@@ -17,25 +276,25 @@ Version 0.2.7
common:
* Support LSB (Linux Standard Base) 4.1.
e.g. make CC=lsbcc
* Include LAPACK 3.4.2 source codes to the repo.
* Include LAPACK 3.4.2 source codes to the repo.
Avoid downloading at compile time.
* Add NO_PARALLEL_MAKE flag to disable parallel make.
* Create openblas_get_parallel to retrieve information which
* Create openblas_get_parallel to retrieve information which
parallelization model is used by OpenBLAS. (Thank grisuthedragon)
* Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X.
* Change LIBSUFFIX from .lib to .a on windows.
* A walk round for dtrti_U single thread bug. Replace it with LAPACK codes. (#191)
* A work-around for dtrti_U single thread bug. Replace it with LAPACK codes. (#191)
x86/x86-64:
* Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on
* Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on
AMD Bulldozer. (Thank Werner Saar)
* Add Intel Haswell support (using Sandybridge optimizations).
(Thank Dan Luu)
* Add AMD Piledriver support (using Bulldozer optimizations).
* Fix the computational error in zgemm avx kernel on
* Fix the computational error in zgemm avx kernel on
Sandybridge. (#237)
* Fix the overflow bug in gemv.
* Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS
* Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS
is very large.(#214, #221, #246).
MIPS64:
* Support loongcc (Open64 based) compiler for ICT Loongson 3A/B.
@@ -72,7 +331,7 @@ common:
* Fixed NetBSD build. (#155)
* Fixed compilation with TARGET=GENERIC. (#160)
x86/x86-64:
* Restore the original CPU affinity when calling
* Restore the original CPU affinity when calling
openblas_set_num_threads(1) (#153)
* Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154)
MIPS64:
@@ -82,13 +341,13 @@ Version 0.2.4
8-Oct-2012
common:
* Upgraded LAPACK to 3.4.2 version. (#145)
* Provided support for passing CFLAGS, FFLAGS, PFLAGS,
* Provided support for passing CFLAGS, FFLAGS, PFLAGS,
FPFLAGS to make. (#137)
* f77blas.h:compatibility for compilers without C99 complex
* f77blas.h:compatibility for compilers without C99 complex
number support. (#141)
x86/x86-64:
* Added NO_AVX flag. Check OS supporting AVX on runtime. (#139)
* Fixed zdot incompatibility ABI issue with GCC 4.7 on
* Fixed zdot incompatibility ABI issue with GCC 4.7 on
Windows 32-bit. (#140)
MIPS64:
* Fixed the generation of shared library bug.
@@ -98,14 +357,14 @@ Version 0.2.3
20-Aug-2012
common:
* Fixed LAPACK unstable bug about ?laswp. (#130)
* Fixed the shared library bug about unloading the library on
* Fixed the shared library bug about unloading the library on
Linux (#132).
* Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2)
Please use gcc and IBM xlf. (#134)
x86/x86-64:
* Supported goto_set_num_threads and openblas_set_num_threads
* Supported goto_set_num_threads and openblas_set_num_threads
APIs in Windows. They can set the number of threads on runtime.
====================================================================
Version 0.2.2
6-July-2012
@@ -153,14 +412,14 @@ x86/x86_64:
* Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
* Test alpha=Nan in dscale.
* Fixed a SEGFAULT bug in samax on x86 windows.
====================================================================
Version 0.1.0
23-Mar-2012
common:
* Set soname of shared library on Linux.
* Added LIBNAMESUFFIX flag in Makefile.rule. The user can use
this flag to control the library name, e.g. libopenblas.a,
* Added LIBNAMESUFFIX flag in Makefile.rule. The user can use
this flag to control the library name, e.g. libopenblas.a,
libopenblas_ifort.a or libopenblas_omp.a.
* Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule.
The lib use single thread in GEMM function with small matrices.
@@ -191,7 +450,7 @@ x86/x86_64:
Version 0.1 alpha2.4
18-Sep-2011
common:
* Fixed a bug about installation. The header file "fblas77.h"
* Fixed a bug about installation. The header file "fblas77.h"
works fine now.
* Fixed #61 a building bug about setting TARGET and DYNAMIC_ARCH.
* Try to handle absolute path of shared library in OSX. (#57)
@@ -200,16 +459,16 @@ common:
$(PREFIX)/lib
x86/x86_64:
* Fixed #58 zdot/xdot SEGFAULT bug with GCC-4.6 on x86. According
to i386 calling convention, The callee should remove the first
hidden parameter.Thank Mr. John for this patch.
* Fixed #58 zdot/xdot SEGFAULT bug with GCC-4.6 on x86. According
to i386 calling convention, The callee should remove the first
hidden parameter.Thank Mr. John for this patch.
====================================================================
Version 0.1 alpha2.3
5-Sep-2011
x86/x86_64:
* Added DTB_ENTRIES into dynamic arch setting parameters. Now,
* Added DTB_ENTRIES into dynamic arch setting parameters. Now,
it can read DTB_ENTRIES on runtime. (Refs issue #55 on github)
====================================================================
@@ -217,7 +476,7 @@ Version 0.1 alpha2.2
14-Jul-2011
common:
* Fixed a building bug when DYNAMIC_ARCH=1 & INTERFACE64=1.
* Fixed a building bug when DYNAMIC_ARCH=1 & INTERFACE64=1.
(Refs issue #44 on github)
====================================================================
@@ -225,7 +484,7 @@ Version 0.1 alpha2.1
28-Jun-2011
common:
* Stop the build and output the error message when detecting
* Stop the build and output the error message when detecting
fortran compiler failed. (Refs issue #42 on github)
====================================================================
@@ -233,16 +492,16 @@ Version 0.1 alpha2
23-Jun-2011
common:
* Fixed blasint undefined bug in <cblas.h> file. Other software
* Fixed blasint undefined bug in <cblas.h> file. Other software
could include this header successfully(Refs issue #13 on github)
* Fixed the SEGFAULT bug on 64 cores. On SMP server, the number
of CPUs or cores should be less than or equal to 64.(Refs issue #14
* Fixed the SEGFAULT bug on 64 cores. On SMP server, the number
of CPUs or cores should be less than or equal to 64.(Refs issue #14
on github)
* Support "void goto_set_num_threads(int num_threads)" and "void
openblas_set_num_threads(int num_threads)" when USE_OPENMP=1
* Added extern "C" to support C++. Thank Tasio for the patch(Refs
* Added extern "C" to support C++. Thank Tasio for the patch(Refs
issue #21 on github)
* Provided an error message when the arch is not supported.(Refs
* Provided an error message when the arch is not supported.(Refs
issue #19 on github)
* Fixed issue #23. Fixed a bug of f_check script about generating link flags.
* Added openblas_set_num_threads for Fortran.
@@ -257,10 +516,10 @@ x86/x86_64:
* Fixed #28 a wrong result of dsdot on x86_64.
* Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6.
* Fixed #33 ztrmm bug on Nehalem.
* Walk round #27 the low performance axpy issue with small imput size & multithreads.
* Work-around #27 the low performance axpy issue with small imput size & multithreads.
MIPS64:
* Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
* Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
* Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2)
* Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3)
@@ -269,9 +528,9 @@ Version 0.1 alpha1
20-Mar-2011
common:
* Support "make NO_LAPACK=1" to build the library without
* Support "make NO_LAPACK=1" to build the library without
LAPACK functions.
* Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34.
* Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34.
Thank Mr.Ei-ji Nakama providing this patch. (Refs issue #12 on github)
* Added DEBUG=1 rule in Makefile.rule to build debug version.
* Disable compiling quad precision in reference BLAS library(netlib BLAS).
@@ -280,15 +539,15 @@ common:
* Imported GotoBLAS2 1.13 BSD version
x86/x86_64:
* On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue
zdotu & zdotc failures.Instead,Walk around it. (Refs issue #8 #9 on github)
* Modified ?axpy functions to return same netlib BLAS results
* On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue
zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github)
* Modified ?axpy functions to return same netlib BLAS results
when incx==0 or incy==0 (Refs issue #7 on github)
* Modified ?swap functions to return same netlib BLAS results
* Modified ?swap functions to return same netlib BLAS results
when incx==0 or incy==0 (Refs issue #6 on github)
* Modified ?rot functions to return same netlib BLAS results
* Modified ?rot functions to return same netlib BLAS results
when incx==0 or incy==0 (Refs issue #4 on github)
* Detect Intel Westmere,Intel Clarkdale and Intel Arrandale
* Detect Intel Westmere,Intel Clarkdale and Intel Arrandale
to use Nehalem codes.
* Fixed a typo bug about compiling dynamic ARCH library.
MIPS64:

View File

@@ -83,7 +83,7 @@
4. Suported precision
Now x86/x86_64 version support 80bit FP precision in addition to
normal double presicion and single precision. Currently only
normal double presicion and single precision. Currently only
gfortran supports 80bit FP with "REAL*10".

View File

@@ -32,9 +32,9 @@
GotoBLAS2 build complete.
OS ... Linux
Architecture ... x86_64
BINARY ... 64bit
OS ... Linux
Architecture ... x86_64
BINARY ... 64bit
C compiler ... GCC (command line : gcc)
Fortran compiler ... PATHSCALE (command line : pathf90)
Library Name ... libgoto_barcelonap-r1.27.a (Multi threaded; Max

View File

@@ -56,7 +56,7 @@
1.6 Q I use OpenMP compiler. How can I use GotoBLAS2 with it?
A Please understand that OpenMP is a compromised method to use
A Please understand that OpenMP is a compromised method to use
thread. If you want to use OpenMP based code with GotoBLAS2, you
should enable "USE_OPENMP=1" in Makefile.rule.

View File

@@ -9,10 +9,10 @@
If you want to allocate 64 large pages,
$shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset
$shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page
$shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number
$shell> echo 3355443200 > /pros/sys/kernel/shmall
$shell> echo 0 > /proc/sys/vm/nr_hugepages # need to be reset
$shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page
$shell> echo 3355443200 > /proc/sys/kernel/shmmax # just large number
$shell> echo 3355443200 > /proc/sys/kernel/shmall
Also may add a few lines into /etc/security/limits.conf file.
@@ -43,7 +43,7 @@
F) Other aarchitecture which doesn't have Large TLB enhancement
If you have root permission, please install device driver which
located in drivers/mapper.
located in drivers/mapper.
$shell> cd drivers/mapper
$shell> make

View File

@@ -4,7 +4,7 @@
probably you created too many threads or process. Basically GotoBLAS
assumes that available cores that you specify are exclusively for
BLAS computation. Even one small thread/process conflicts with BLAS
threads, performance will become worse.
threads, performance will become worse.
The best solution is to reduce your number of threads or insert
some synchronization mechanism and suspend your threads until BLAS
@@ -19,4 +19,4 @@
Anyway, if you see any weird performance loss, it means your code or
algorithm is not optimal.
algorithm is not optimal.

27
LICENSE
View File

@@ -1,4 +1,4 @@
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -12,17 +12,18 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

125
Makefile
View File

@@ -4,11 +4,7 @@ include ./Makefile.system
BLASDIRS = interface driver/level2 driver/level3 driver/others
ifneq ($(DYNAMIC_ARCH), 1)
BLASDIRS += kernel
endif
ifdef UTEST_CHECK
SANITY_CHECK = 1
BLASDIRS += kernel
endif
ifdef SANITY_CHECK
@@ -20,10 +16,12 @@ ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install
.NOTPARALLEL : all libs prof lapack-test install
.NOTPARALLEL : all libs prof lapack-test install blas-test
all :: libs netlib tests shared
@echo
@@ -36,9 +34,13 @@ ifndef BINARY64
else
@echo " BINARY ... 64bit "
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
endif
endif
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
@@ -57,7 +59,7 @@ endif
ifeq ($(USE_OPENMP), 1)
@echo
@echo " Use OpenMP in the multithreading. Becasue of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, "
@echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, "
@echo " you should use OMP_NUM_THREADS environment variable to control the number of threads."
@echo
endif
@@ -79,22 +81,22 @@ endif
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), Linux)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@@ -111,10 +113,8 @@ ifndef CROSS
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
ifdef UTEST_CHECK
$(MAKE) -C utest all
endif
endif
ifndef NO_CBLAS
$(MAKE) -C ctest all
endif
@@ -127,7 +127,12 @@ ifeq ($(CORE), UNKOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif
ifeq ($(NOFORTRAN), 1)
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
$(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.)
endif
ifeq ($(NO_STATIC), 1)
ifeq ($(NO_SHARED), 1)
$(error OpenBLAS: neither static nor shared are enabled.)
endif
endif
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@for d in $(SUBDIRS) ; \
@@ -144,7 +149,7 @@ endif
ifeq ($(EXPRECISION), 1)
@echo "#define EXPRECISION">> config_last.h
endif
##
##
ifeq ($(DYNAMIC_ARCH), 1)
@$(MAKE) -C kernel commonlibs || exit 1
@for d in $(DYNAMIC_CORE) ; \
@@ -178,7 +183,7 @@ blas :
fi; \
done
hpl :
hpl :
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(BLASDIRS) ../laswp exports ; \
do if test -d $$d; then \
@@ -201,12 +206,13 @@ hpl_p :
done
ifeq ($(NO_LAPACK), 1)
netlib :
netlib :
else
netlib : lapack_prebuild
ifndef NOFORTRAN
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif
ifndef NO_LAPACKE
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
@@ -221,7 +227,7 @@ ifndef NOFORTRAN
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -230,46 +236,35 @@ ifndef NOFORTRAN
-@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
ifeq ($(F_COMPILER), GFORTRAN)
-@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc
ifdef SMP
ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
else
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
else
-@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
-@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
endif
lapack-3.4.2 : lapack-3.4.2.tgz
ifndef NOFORTRAN
ifndef NO_LAPACK
@if test `$(MD5SUM) $< | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
echo $(TAR) zxf $< ;\
$(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\
rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\
else \
rm -rf $(NETLIB_LAPACK_DIR) ;\
echo " Cannot download lapack-3.4.2.tgz or the MD5 check sum is wrong (Please use orignal)."; \
exit 1; \
fi
endif
endif
LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.2.tgz
lapack-3.4.2.tgz :
ifndef NOFORTRAN
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
curl -O $(LAPACK_URL);
else
ifeq ($(OSNAME), FreeBSD)
fetch $(LAPACK_URL);
else
wget -O $@ $(LAPACK_URL);
endif
endif
endif
large.tgz :
large.tgz :
ifndef NOFORTRAN
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
@@ -287,17 +282,30 @@ lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
make -C $(NETLIB_LAPACK_DIR) tmglib
make -C $(NETLIB_LAPACK_DIR)/TIMING
endif
lapack-test :
$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc
@rm -f $(NETLIB_LAPACK_DIR)/TESTING/*.out
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING
$(GREP) failed $(NETLIB_LAPACK_DIR)/TESTING/*.out
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
endif
lapack-runtest:
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
dummy :
@@ -323,4 +331,5 @@ endif
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
@echo Done.

View File

@@ -50,7 +50,7 @@ endif
ifndef SMP
LIBCXML = -lcxml -lots -lm
LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm
LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm
else
LIBCXML = -lcxmlp -lots -lm
LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm

View File

@@ -1,8 +1,23 @@
ifeq ($(CORE), ARMV7)
# ifeq logical or
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
@@ -10,3 +25,7 @@ FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -march=armv5
FCOMMON_OPT += -marm -march=armv5
endif

View File

@@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a
endif
ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
endif

View File

@@ -16,7 +16,7 @@ LIBMLIB = ../../level1/others/libmisc.a -L/opt/intel/fc/ia64/9.1.040/lib -L/opt
LIBSCSL = -L/opt/scsl/1.4.1.0/lib -Wl,-rpath,/opt/scsl/1.4.1.0/lib -lscs
ifndef SMP
LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm
LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm
else
LIBATLAS = -L$(HOME)/misc/lib -L/usr/lib/atlas3.6.0p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm
endif

View File

@@ -7,7 +7,11 @@ PREFIX ?= /opt/OpenBLAS
OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
OPENBLAS_BINARY_DIR := $(PREFIX)/bin
OPENBLAS_BUILD_DIR := $(CURDIR)
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
.PHONY : install
.NOTPARALLEL : install
@@ -19,11 +23,13 @@ install : lib.grd
@-mkdir -p $(DESTDIR)$(PREFIX)
@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc
#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@awk '{print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@@ -41,22 +47,24 @@ ifndef NO_CBLAS
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
endif
#for install static library
#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
#for install shared library
endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), Linux)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
@@ -72,18 +80,50 @@ ifeq ($(OSNAME), NetBSD)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
@-ln -fs $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
endif
endif
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
endif
ifeq ($(OSNAME), Darwin)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
endif
else
#only static
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
endif
#Generating OpenBLASConfigVersion.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo Install OK!

View File

@@ -5,7 +5,7 @@ FLAMEPATH = $(HOME)/flame/lib
#ifeq ($(CORE), CELL)
#CELL_SDK_ROOT = /opt/IBM/cell-sdk-1.1/sysroot/usr
#SPU_CC = spu-gcc
#EXTRALIB += -lspe
#EXTRALIB += -lspe
#endif
ifeq ($(OSNAME), Linux)
@@ -38,7 +38,7 @@ ASFLAGS = -a32
endif
endif
# CCOMMON_OPT += -maltivec -mabi=altivec
# CCOMMON_OPT += -maltivec -mabi=altivec
LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS)
@@ -57,7 +57,7 @@ endif
LIBVECLIB = -framework VecLib
ifndef SMP
LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm
LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm
LIBESSL = -lessl $(ESSLPATH) ../../level1/others/libmisc.a -lm
else
LIBATLAS = -L/usr/lib/atlas3.7.11p -lptf77blas -latlas -lm -lpthread
@@ -73,7 +73,7 @@ endif
LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib
ifndef SMP
LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm
LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm
LIBESSL = -lessl $(ESSLPATH) -lm
else
LIBATLAS = -L/usr/lib64/atlas3.7.11p -lptf77blas -latlas -lm -lpthread

View File

@@ -1,12 +1,12 @@
#
# Beginning of user configuration
# Beginning of user configuration
#
# This library's version
VERSION = 0.2.8
VERSION = 0.2.18
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
# is libopenblas_$(LIBNAMESUFFIX).so.0.
# LIBNAMESUFFIX = omp
@@ -25,9 +25,20 @@ VERSION = 0.2.8
# FC = gfortran
# Even you can specify cross compiler. Meanwhile, please set HOSTCC.
# cross compiler for Windows
# CC = x86_64-w64-mingw32-gcc
# FC = x86_64-w64-mingw32-gfortran
# cross compiler for 32bit ARM
# CC = arm-linux-gnueabihf-gcc
# FC = arm-linux-gnueabihf-gfortran
# cross compiler for 64bit ARM
# CC = aarch64-linux-gnu-gcc
# FC = aarch64-linux-gnu-gfortran
# If you use the cross compiler, please set this host compiler.
# HOSTCC = gcc
@@ -48,23 +59,29 @@ VERSION = 0.2.8
# automatically detected by the the script.
# NUM_THREADS = 24
# if you don't need to install the static library, please comment it in.
# NO_STATIC = 1
# if you don't need generate the shared library, please comment it in.
# NO_SHARED = 1
# If you don't need CBLAS interface, please comment it in.
# NO_CBLAS = 1
# If you only want CBLAS interface without installing Fortran compiler,
# If you only want CBLAS interface without installing Fortran compiler,
# please comment it in.
# ONLY_CBLAS = 1
# If you don't need LAPACK, please comment it in.
# If you don't need LAPACK, please comment it in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
# NO_LAPACKE = 1
# Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
@@ -76,18 +93,31 @@ VERSION = 0.2.8
# Unfortunately most of kernel won't give us high quality buffer.
# BLAS tries to find the best region before entering main function,
# but it will consume time. If you don't like it, you can disable one.
# NO_WARMUP = 1
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
# NO_AFFINITY = 1
NO_AFFINITY = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
# and OS. However, the performance is low.
# NO_AVX = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
# Force number of make jobs. The default is the number of logical CPU of the host.
# This is particularly useful when using distcc.
# A negative value will disable adding a -j flag to make, allowing to use a parent
# make -j value. This is useful to call OpenBLAS make from an other project
# makefile
# MAKE_NB_JOBS = 2
# If you would like to know minute performance report of GotoBLAS.
# FUNCTION_PROFILE = 1
@@ -109,8 +139,8 @@ VERSION = 0.2.8
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 4.
# GEMM_MULTITHREAD_THRESHOLD = 4
@@ -118,23 +148,40 @@ VERSION = 0.2.8
# slow (Not implemented yet).
# SANITY_CHECK = 1
# Run testcases in utest/ . When you enable UTEST_CHECK, it would enable
# SANITY_CHECK to compare the result with reference BLAS.
# UTEST_CHECK = 1
# The installation directory.
# PREFIX = /opt/OpenBLAS
# Common Optimization Flag;
# Common Optimization Flag;
# The default -O2 is enough.
# COMMON_OPT = -O2
# gfortran option for LAPACK
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
# FCOMMON_OPT = -frecursive
# Profiling flags
COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
# Set maximum stack allocation.
# The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV
# performance. For details, https://github.com/xianyi/OpenBLAS/pull/482
#
# End of user configuration
# MAX_STACK_ALLOC = 0
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoid conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.
# For details, https://github.com/xianyi/OpenBLAS/pull/459
#
# The same prefix and suffix are also added to the library name,
# i.e. you get lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) rather than libopenblas
#
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
#
# End of user configuration
#

View File

@@ -27,7 +27,7 @@ LIBNAME = $(LIBPREFIX).a
ifndef SMP
LIBCXML = -L/opt/SUNWspro/lib/v9
LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm
LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm
else
LIBCXML = -lcxmlp -lots -lm
endif

View File

@@ -23,6 +23,7 @@ CC = gcc
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
endif
endif
@@ -35,7 +36,7 @@ include $(TOPDIR)/$(MAKEFILE_RULE)
endif
#
# Beginning of system configuration
# Beginning of system configuration
#
ifndef HOSTCC
@@ -46,25 +47,85 @@ ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
endif
# Force fallbacks for 32bit
ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), STEAMROLLER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
#
ifdef TARGET_CORE
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
endif
# Force fallbacks for 32bit
ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), STEAMROLLER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
GETARCH_FLAGS += -DUSE64BITINT
endif
endif
ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4
endif
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
ifeq ($(NO_AVX), 1)
GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(NO_AVX2), 1)
GETARCH_FLAGS += -DNO_AVX2
endif
ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
@@ -78,6 +139,10 @@ NO_PARALLEL_MAKE=0
endif
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
ifdef MAKE_NB_JOBS
GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS)
endif
ifeq ($(HOSTCC), loongcc)
GETARCH_FLAGS += -static
endif
@@ -138,13 +203,21 @@ LD = $(CROSS_SUFFIX)ld
RANLIB = $(CROSS_SUFFIX)ranlib
NM = $(CROSS_SUFFIX)nm
DLLWRAP = $(CROSS_SUFFIX)dllwrap
OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv
# For detect fortran failed, only build BLAS.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
endif
#
# OS dependent settings
#
ifeq ($(OSNAME), Darwin)
export MACOSX_DEPLOYMENT_TARGET=10.2
export MACOSX_DEPLOYMENT_TARGET=10.6
MD5SUM = md5 -r
endif
@@ -158,6 +231,7 @@ endif
ifeq ($(OSNAME), Linux)
EXTRALIB += -lm
NO_EXPRECISION = 1
endif
ifeq ($(OSNAME), AIX)
@@ -185,14 +259,14 @@ GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
# GCC Majar version > 4
# It is compatible with MSVC ABI.
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
ifeq ($(GCCVERSIONGTEQ4), 1)
ifeq ($(GCCMINORVERSIONGTEQ7), 1)
# GCC Version >=4.7
# It is compatible with MSVC ABI.
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
endif
@@ -222,12 +296,14 @@ endif
ifneq ($(OSNAME), WINNT)
ifneq ($(OSNAME), CYGWIN_NT)
ifneq ($(OSNAME), Interix)
ifneq ($(OSNAME), Android)
ifdef SMP
EXTRALIB += -lpthread
endif
endif
endif
endif
endif
# ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
@@ -254,6 +330,11 @@ ifdef SANITY_CHECK
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
endif
MAX_STACK_ALLOC ?= 2048
ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
#
# Architecture dependent settings
#
@@ -262,6 +343,11 @@ ifeq ($(ARCH), x86)
ifndef BINARY
NO_BINARY_MODE = 1
endif
ifeq ($(CORE), generic)
NO_EXPRECISION = 1
endif
ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN)
# ifeq logical or. GCC or LSB
@@ -272,7 +358,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@@ -280,6 +366,11 @@ endif
endif
ifeq ($(ARCH), x86_64)
ifeq ($(CORE), generic)
NO_EXPRECISION = 1
endif
ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN)
# ifeq logical or. GCC or LSB
@@ -290,7 +381,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@@ -301,7 +392,14 @@ ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -wd981
endif
ifeq ($(USE_OPENMP), 1)
#check
ifeq ($(USE_THREAD), 0)
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
endif
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
CCOMMON_OPT += -fopenmp
@@ -335,15 +433,15 @@ ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL
endif
endif
@@ -413,12 +511,12 @@ endif
BINARY_DEFINED = 1
endif
ifeq ($(CORE), LOONGSON3A)
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), LOONGSON3B)
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
@@ -488,7 +586,7 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
EXTRALIB += -lgfortran
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
@@ -502,13 +600,15 @@ else
ifdef BINARY64
FCOMMON_OPT += -m64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -fdefault-integer-8
endif
endif
else
FCOMMON_OPT += -m32
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -fopenmp
endif
endif
@@ -516,16 +616,18 @@ endif
ifeq ($(F_COMPILER), INTEL)
CCOMMON_OPT += -DF_INTERFACE_INTEL
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
ifdef USE_OPENMP
endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
ifeq ($(F_COMPILER), FUJITSU)
CCOMMON_OPT += -DF_INTERFACE_FUJITSU
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
@@ -536,12 +638,14 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
ifdef BINARY64
FCOMMON_OPT += -q64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -qintsize=8
endif
endif
else
FCOMMON_OPT += -q32
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
@@ -551,13 +655,15 @@ CCOMMON_OPT += -DF_INTERFACE_PGI
COMMON_PROF += -DPGICOMPILER
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
FCOMMON_OPT += -tp p7-64
else
FCOMMON_OPT += -tp p7
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
endif
@@ -566,9 +672,11 @@ ifeq ($(F_COMPILER), PATHSCALE)
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
endif
ifneq ($(ARCH), mips64)
ifndef BINARY64
@@ -584,7 +692,7 @@ FCOMMON_OPT += -mabi=n32
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
endif
@@ -593,9 +701,11 @@ ifeq ($(F_COMPILER), OPEN64)
CCOMMON_OPT += -DF_INTERFACE_OPEN64
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
endif
ifeq ($(ARCH), mips64)
ifndef BINARY64
@@ -603,11 +713,11 @@ FCOMMON_OPT += -n32
else
FCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
ifeq ($(CORE), LOONGSON3A)
FCOMMON_OPT += -loongson3 -static
endif
ifeq ($(CORE), LOONGSON3B)
ifeq ($(CORE), LOONGSON3B)
FCOMMON_OPT += -loongson3 -static
endif
@@ -619,7 +729,7 @@ FCOMMON_OPT += -m64
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FEXTRALIB += -lstdc++
FCOMMON_OPT += -mp
endif
@@ -633,11 +743,11 @@ CCOMMON_OPT += -n32
else
CCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -loongson3 -static
endif
ifeq ($(CORE), LOONGSON3B)
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -loongson3 -static
endif
@@ -667,35 +777,37 @@ FCOMMON_OPT += -m32
else
FCOMMON_OPT += -m64
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -xopenmp=parallel
endif
endif
ifeq ($(F_COMPILER), COMPAQ)
CCOMMON_OPT += -DF_INTERFACE_COMPAQ
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
ifdef BINARY64
ifdef INTERFACE64
CCOMMON_OPT +=
ifneq ($(INTERFACE64), 0)
CCOMMON_OPT +=
#-DUSE64BITINT
endif
endif
endif
ifeq ($(NEED_PIC), 1)
ifeq ($(C_COMPILER), IBM)
CCOMMON_OPT += -qpic=large
CCOMMON_OPT += -qpic=large
else
CCOMMON_OPT += -fPIC
CCOMMON_OPT += -fPIC
endif
ifeq ($(F_COMPILER), SUN)
FCOMMON_OPT += -pic
else
FCOMMON_OPT += -fPIC
FCOMMON_OPT += -fPIC
endif
endif
@@ -717,6 +829,14 @@ ifeq ($(NO_AVX), 1)
CCOMMON_OPT += -DNO_AVX
endif
ifeq ($(ARCH), x86)
CCOMMON_OPT += -DNO_AVX
endif
ifeq ($(NO_AVX2), 1)
CCOMMON_OPT += -DNO_AVX2
endif
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
@@ -732,6 +852,10 @@ ifeq ($(USE_OPENMP), 1)
CCOMMON_OPT += -DUSE_OPENMP
endif
ifeq ($(BIGNUMA), 1)
CCOMMON_OPT += -DBIGNUMA
endif
endif
ifeq ($(NO_WARMUP), 1)
@@ -763,10 +887,18 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
ifndef SYMBOLPREFIX
SYMBOLPREFIX =
endif
ifndef SYMBOLSUFFIX
SYMBOLSUFFIX =
endif
ifndef LIBNAMESUFFIX
LIBPREFIX = libopenblas
LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)
else
LIBPREFIX = libopenblas_$(LIBNAMESUFFIX)
LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
endif
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
@@ -827,17 +959,18 @@ ifeq ($(OSNAME), SunOS)
TAR = gtar
PATCH = gpatch
GREP = ggrep
AWK = nawk
else
TAR = tar
PATCH = patch
GREP = grep
AWK = awk
endif
ifndef MD5SUM
MD5SUM = md5sum
endif
AWK = awk
REVISION = -r$(VERSION)
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
@@ -846,40 +979,45 @@ ifeq ($(DEBUG), 1)
COMMON_OPT += -g
endif
ifndef COMMON_OPT
ifeq ($(ARCH), arm)
COMMON_OPT = -O3
ifeq ($(DEBUG), 1)
FCOMMON_OPT += -g
endif
endif
ifndef COMMON_OPT
ifeq ($(ARCH), arm64)
COMMON_OPT = -O3
endif
endif
ifndef COMMON_OPT
COMMON_OPT = -O2
endif
ifndef FCOMMON_OPT
FCOMMON_OPT = -O2 -frecursive
endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
override FFLAGS += $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
#For LAPACK Fortran codes.
#Disable -fopenmp for LAPACK Fortran codes on Windows.
ifdef OS_WINDOWS
LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS))
LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS))
else
LAPACK_FFLAGS := $(FFLAGS)
LAPACK_FPFLAGS := $(FPFLAGS)
endif
LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
LAPACK_CFLAGS += -DLAPACK_ILP64
endif
endif
ifdef OS_WINDOWS
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
endif
@@ -1049,4 +1187,3 @@ SUNPATH = /opt/sunstudio12.1
else
SUNPATH = /opt/SUNWspro
endif

View File

@@ -57,7 +57,7 @@ commonlibs :: $(COMMONOBJS)
commonprof :: $(COMMONOBJS_P)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^
quick :
quick :
$(MAKE) -C $(TOPDIR) libs
bms.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h
@@ -386,7 +386,7 @@ kbench_rank_k: kbench_rank_k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS
smallbench: smallbench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS)
$(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB)
smallbench.mkl: smallbench.$(SUFFIX)
smallbench.mkl: smallbench.$(SUFFIX)
$(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB)
bench.sun: bench.$(SUFFIX) $(OBJS)
@@ -410,7 +410,7 @@ bench.acml: bench.$(SUFFIX) $(OBJS)
bench.flame: bench.$(SUFFIX) $(OBJS)
$(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB)
kbench.mkl: kbench.$(SUFFIX) $(OBJS)
kbench.mkl: kbench.$(SUFFIX) $(OBJS)
$(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB)
bench.mkl: bench.$(SUFFIX) $(OBJS)
@@ -537,10 +537,10 @@ params.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F)
paramd.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F)
$(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F)
paramq.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F)
$(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F)
paramc.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F)
@@ -555,10 +555,10 @@ params-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F)
paramd-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F)
$(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F)
paramq-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F)
$(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F)
paramc-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F)

View File

@@ -14,7 +14,7 @@ endif
# LIBMKL = -L$(MKLPATH)/32 -lmkl_lapack -lmkl_ia32 -lguide -lpthread -lm
ifndef SMP
LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm
LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm
else
LIBATLAS = -L$(ATLAS) -lptf77blas -latlas -lpthread -lg2c -lm
endif
@@ -50,7 +50,7 @@ LIBSUNPERF = -L/opt/SUNWspro/lib/sse2 -Wl,-R,/opt/SUNWspro/lib/sse2 -lsunperf
LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib
ifndef SMP
LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm
LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm
else
LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm
endif

View File

@@ -28,7 +28,7 @@ endif
ifndef SMP
LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm
LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm
else
LIBATLAS = -L$(ATLASPATH)64 -llapack -lptcblas -lptf77blas -latlas -lpthread -lm
endif

View File

@@ -1,9 +1,12 @@
# OpenBLAS
[![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
[![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
@@ -55,26 +58,35 @@ Please read GotoBLAS_01Readme.txt
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
- **AMD PILEDRIVER**: Used Bulldozer codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
- **ICT Loongson 3B**: Experimental
#### ARM:
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
#### ARM64:
- **ARMV8**: Experimental
- **ARM Cortex-A57**: Experimental
### Support OS:
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
## Usages
Link with libopenblas.a or -lopenblas for shared library.
### Set the number of threads with environment variables.
### Set the number of threads with environment variables.
Examples:
@@ -84,7 +96,7 @@ Examples:
export GOTO_NUM_THREADS=4
or
or
export OMP_NUM_THREADS=4
@@ -92,7 +104,7 @@ The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
### Set the number of threads on runtime.
### Set the number of threads on runtime.
We provided the below functions to control the number of threads on runtime.
@@ -116,12 +128,12 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should less than or equal to 256.
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
## Contributing
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
1. Write a test which shows that the bug was fixed or that the feature works as expected.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.

View File

@@ -19,6 +19,7 @@ PENRYN
DUNNINGTON
NEHALEM
SANDYBRIDGE
HASWELL
ATOM
b)AMD CPU:
@@ -30,6 +31,9 @@ SHANGHAI
ISTANBUL
BOBCAT
BULLDOZER
PILEDRIVER
STEAMROLLER
EXCAVATOR
c)VIA CPU:
SSE_GENERIC
@@ -40,6 +44,8 @@ NANO
POWER4
POWER5
POWER6
POWER7
POWER8
PPCG4
PPC970
PPC970MP
@@ -59,3 +65,14 @@ ITANIUM2
SPARC
SPARCV7
6.ARM CPU:
CORTEXA15
CORTEXA9
ARMV7
ARMV6
ARMV5
7.ARM 64-bit CPU:
ARMV8
CORTEXA57

199
USAGE.md Normal file
View File

@@ -0,0 +1,199 @@
# Notes on OpenBLAS usage
## Usage
#### Program is Terminated. Because you tried to allocate too many memory regions
In OpenBLAS, we mange a pool of memory buffers and allocate the number of
buffers as the following.
```
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
```
This error indicates that the program exceeded the number of buffers.
Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
`MAX_CPU_NUMBER=NUM_THREADS`.
#### How can I use OpenBLAS in multi-threaded applications?
If your application is already multi-threaded, it will conflict with OpenBLAS
multi-threading. Thus, you must set OpenBLAS to use single thread in any of the
following ways:
* `export OPENBLAS_NUM_THREADS=1` in the environment variables.
* Call `openblas_set_num_threads(1)` in the application on runtime.
* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0`
If the application is parallelized by OpenMP, please use OpenBLAS built with
`USE_OPENMP=1`
#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
The environment variable which control the kernel selection is
`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export
OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()`
returns the used target.
#### How could I disable OpenBLAS threading affinity on runtime?
You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
variable to disable threading affinity on runtime. For example, before the
running,
```
export OPENBLAS_MAIN_FREE=1
```
Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1`
in `Makefile.rule`.
## Linking with the library
* Link with shared library
`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
If the library is multithreaded, please add `-lpthread`. If the library
contains LAPACK functions, please add `-lgfortran` or other Fortran libs.
* Link with static library
`gcc -o test test.c /your/path/libopenblas.a`
You can download `test.c` from https://gist.github.com/xianyi/5780018
On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
default), custom programs statically linked against `libopenblas.a` should also
link with the pthread library e.g.:
```
gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
```
Failing to add the `-lpthread` flag will cause errors such as:
```
/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
...
```
## Code examples
#### Call CBLAS interface
This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
```
#include <cblas.h>
#include <stdio.h>
void main()
{
int i=0;
double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);
for(i=0; i<9; i++)
printf("%lf ", C[i]);
printf("\n");
}
```
`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
#### Call BLAS Fortran interface
This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
```
#include "stdio.h"
#include "stdlib.h"
#include "sys/time.h"
#include "time.h"
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
int main(int argc, char* argv[])
{
int i;
printf("test!\n");
if(argc<4){
printf("Input Error\n");
return 1;
}
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int k = atoi(argv[3]);
int sizeofa = m * k;
int sizeofb = k * n;
int sizeofc = m * n;
char ta = 'N';
char tb = 'N';
double alpha = 1.2;
double beta = 0.001;
struct timeval start,finish;
double duration;
double* A = (double*)malloc(sizeof(double) * sizeofa);
double* B = (double*)malloc(sizeof(double) * sizeofb);
double* C = (double*)malloc(sizeof(double) * sizeofc);
srand((unsigned)time(NULL));
for (i=0; i<sizeofa; i++)
A[i] = i%3+1;//(rand()%100)/10.0;
for (i=0; i<sizeofb; i++)
B[i] = i%3+1;//(rand()%100)/10.0;
for (i=0; i<sizeofc; i++)
C[i] = i%3+1;//(rand()%100)/10.0;
//#if 0
printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
gettimeofday(&start, NULL);
dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
gettimeofday(&finish, NULL);
duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
double gflops = 2.0 * m *n*k;
gflops = gflops/duration*1.0e-6;
FILE *fp;
fp = fopen("timeDGEMM.txt", "a");
fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, gflops);
fclose(fp);
free(A);
free(B);
free(C);
return 0;
}
```
` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a`
` ./time_dgemm <m> <n> <k> `
## Troubleshooting
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
## BLAS reference manual
If you want to understand every BLAS function and definition, please read
[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
or [netlib.org](http://netlib.org/blas/)
Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions)
## How to reference OpenBLAS.
You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.

44
appveyor.yml Normal file
View File

@@ -0,0 +1,44 @@
version: 0.2.18.{build}
#environment:
platform:
- x64
configuration: Release
clone_folder: c:\projects\OpenBLAS
init:
- git config --global core.autocrlf input
build:
project: OpenBLAS.sln
clone_depth: 5
#branches to build
branches:
only:
- master
- develop
- cmake
skip_tags: true
matrix:
fast_finish: true
skip_commits:
# Add [av skip] to commit messages
message: /\[av skip\]/
before_build:
- echo Running cmake...
- cd c:\projects\OpenBLAS
- cmake -G "Visual Studio 12 Win64" .
test_script:
- echo Running Test
- cd c:\projects\OpenBLAS\utest
- openblas_utest

9
benchmark/Make_exe.sh Executable file
View File

@@ -0,0 +1,9 @@
#!/bin/bash
for f in *.goto *.acml *.mkl *.atlas
do
if [ -f "$f" ]; then
mv $f `echo $f|tr '.' '_'`.exe
fi
done

File diff suppressed because it is too large Load Diff

196
benchmark/asum.c Normal file
View File

@@ -0,0 +1,196 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef ASUM
#ifdef COMPLEX
#ifdef DOUBLE
#define ASUM BLASFUNC(dzasum)
#else
#define ASUM BLASFUNC(scasum)
#endif
#else
#ifdef DOUBLE
#define ASUM BLASFUNC(dasum)
#else
#define ASUM BLASFUNC(sasum)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
FLOAT result;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = ASUM (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
#else
fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
#endif
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

201
benchmark/axpy.c Normal file
View File

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AXPY
#ifdef COMPLEX
#ifdef DOUBLE
#define AXPY BLASFUNC(zaxpy)
#else
#define AXPY BLASFUNC(caxpy)
#endif
#else
#ifdef DOUBLE
#define AXPY BLASFUNC(daxpy)
#else
#define AXPY BLASFUNC(saxpy)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
AXPY (&m, alpha, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -71,36 +71,43 @@ double fabs(double);
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
static __inline double getmflops(int ratio, int m, double secs){
double mm = (double)m;
double mulflops, addflops;
@@ -117,9 +124,13 @@ static __inline double getmflops(int ratio, int m, double secs){
}
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
#ifndef COMPLEX
char *trans[] = {"T", "N"};
#else
char *trans[] = {"C", "N"};
#endif
char *uplo[] = {"U", "L"};
FLOAT alpha[] = {1.0, 0.0};
FLOAT beta [] = {0.0, 0.0};
@@ -137,7 +148,7 @@ int MAIN__(int argc, char *argv[]){
struct timeval start, stop;
double time1;
argc--;argv++;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
@@ -148,17 +159,17 @@ int MAIN__(int argc, char *argv[]){
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
for(m = from; m <= to; m += step){
fprintf(stderr, "M = %6d : ", (int)m);
for (uplos = 0; uplos < 2; uplos ++) {
#ifndef COMPLEX
if (uplos & 1) {
for (j = 0; j < m; j++) {
@@ -219,11 +230,11 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, "Info = %d\n", info);
exit(1);
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
maxerr = 0.;
if (!(uplos & 1)) {
for (j = 0; j < m; j++) {
for(i = 0; i <= j; i++) {
@@ -247,8 +258,8 @@ int MAIN__(int argc, char *argv[]){
}
}
}
fprintf(stderr,
fprintf(stderr,
#ifdef XDOUBLE
" %Le %10.3f MFlops", maxerr,
#else
@@ -269,4 +280,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

201
benchmark/copy.c Normal file
View File

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef COPY
#ifdef COMPLEX
#ifdef DOUBLE
#define COPY BLASFUNC(zcopy)
#else
#define COPY BLASFUNC(ccopy)
#endif
#else
#ifdef DOUBLE
#define COPY BLASFUNC(dcopy)
#else
#define COPY BLASFUNC(scopy)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
COPY (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

195
benchmark/dot.c Normal file
View File

@@ -0,0 +1,195 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(ddot)
#else
#define DOT BLASFUNC(sdot)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT result;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = DOT (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

261
benchmark/geev.c Normal file
View File

@@ -0,0 +1,261 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEEV
#ifndef COMPLEX
#ifdef XDOUBLE
#define GEEV BLASFUNC(qgeev)
#elif defined(DOUBLE)
#define GEEV BLASFUNC(dgeev)
#else
#define GEEV BLASFUNC(sgeev)
#endif
#else
#ifdef XDOUBLE
#define GEEV BLASFUNC(xgeev)
#elif defined(DOUBLE)
#define GEEV BLASFUNC(zgeev)
#else
#define GEEV BLASFUNC(cgeev)
#endif
#endif
#ifndef COMPLEX
extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a,
blasint* lda, FLOAT* wr, FLOAT* wi, FLOAT* vl, blasint* ldvl,
FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, blasint* info );
#else
extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a,
blasint* lda, FLOAT* wr, FLOAT* vl, blasint* ldvl,
FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info );
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
FLOAT wkopt[4];
char job='V';
char jobr='N';
char *p;
blasint m, i, j, info,lwork;
double factor = 26.33;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_JOB"))) job=*p;
if ( job == 'N' ) factor = 10.0;
fprintf(stderr, "From : %3d To : %3d Step = %3d Job=%c\n", from, to, step,job);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( vl = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( vr = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( wr = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( wi = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( rwork = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
lwork = -1;
m=to;
#ifndef COMPLEX
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
#else
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
#endif
lwork = (blasint)wkopt[0];
if (( work = (FLOAT *)malloc(sizeof(FLOAT) * lwork * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE FLops Time Lwork\n");
for(m = from; m <= to; m += step){
fprintf(stderr, " %6d : ", (int)m);
gettimeofday( &start, (struct timezone *)0);
lwork = -1;
#ifndef COMPLEX
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
#else
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
#endif
lwork = (blasint)wkopt[0];
#ifndef COMPLEX
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info);
#else
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info);
#endif
gettimeofday( &stop, (struct timezone *)0);
if (info) {
fprintf(stderr, "failed to compute eigenvalues .. %d\n", info);
exit(1);
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
fprintf(stderr,
" %10.2f MFlops : %10.2f Sec : %d\n",
COMPSIZE * COMPSIZE * factor * (double)m * (double)m * (double)m / time1 * 1.e-6,time1,lwork);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

223
benchmark/gemm.c Normal file
View File

@@ -0,0 +1,223 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEMM
#ifndef COMPLEX
#ifdef DOUBLE
#define GEMM BLASFUNC(dgemm)
#else
#define GEMM BLASFUNC(sgemm)
#endif
#else
#ifdef DOUBLE
#define GEMM BLASFUNC(zgemm)
#else
#define GEMM BLASFUNC(cgemm)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {0.0, 0.0};
char trans='N';
blasint m, n, i, j;
int loops = 1;
int has_param_n=0;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
p = getenv("OPENBLAS_LOOPS");
if ( p != NULL )
loops = atoi(p);
if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p);
has_param_n=1;
}
#ifdef linux
srandom(getpid());
#endif
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
fprintf(stderr, " SIZE Flops Time\n");
for(m = from; m <= to; m += step)
{
timeg=0;
if ( has_param_n == 1 && n <= m )
n=n;
else
n=m;
fprintf(stderr, " %6dx%d : ", (int)m, (int)n);
gettimeofday( &start, (struct timezone *)0);
for (l=0; l<loops; l++)
{
GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );
}
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg = time1/loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

212
benchmark/gemm3m.c Normal file
View File

@@ -0,0 +1,212 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEMM
#ifndef COMPLEX
#ifdef DOUBLE
#define GEMM BLASFUNC(dgemm)
#else
#define GEMM BLASFUNC(sgemm)
#endif
#else
#ifdef DOUBLE
#define GEMM BLASFUNC(zgemm3m)
#else
#define GEMM BLASFUNC(cgemm3m)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char trans='N';
blasint m, i, j;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
p = getenv("OPENBLAS_LOOPS");
if ( p != NULL )
loops = atoi(p);
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

269
benchmark/gemv.c Normal file
View File

@@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEMV
#ifndef COMPLEX
#ifdef DOUBLE
#define GEMV BLASFUNC(dgemv)
#else
#define GEMV BLASFUNC(sgemv)
#endif
#else
#ifdef DOUBLE
#define GEMV BLASFUNC(zgemv)
#else
#define GEMV BLASFUNC(cgemv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char trans='N';
blasint m, i, j;
blasint inc_x=1,inc_y=1;
blasint n=0;
int has_param_n = 0;
int has_param_m = 0;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
int tomax = to;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p);
if ((n>0)) has_param_n = 1;
if ( n > tomax ) tomax = n;
}
if ( has_param_n == 0 )
if ((p = getenv("OPENBLAS_PARAM_M"))) {
m = atoi(p);
if ((m>0)) has_param_m = 1;
if ( m > tomax ) tomax = m;
}
fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
if (has_param_m == 0)
{
for(m = from; m <= to; m += step)
{
timeg=0;
if ( has_param_n == 0 ) n = m;
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l=0; l<loops; l++)
{
for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
}
}
else
{
for(n = from; n <= to; n += step)
{
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l=0; l<loops; l++)
{
for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
}
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

225
benchmark/ger.c Normal file
View File

@@ -0,0 +1,225 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GER
#ifdef COMPLEX
#ifdef DOUBLE
#define GER BLASFUNC(zgeru)
#else
#define GER BLASFUNC(cgeru)
#endif
#else
#ifdef DOUBLE
#define GER BLASFUNC(dger)
#else
#define GER BLASFUNC(sger)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
blasint m, i, j;
blasint inc_x=1,inc_y=1;
blasint n=0;
int has_param_n = 0;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p);
if ((n>0) && (n<=to)) has_param_n = 1;
}
if ( has_param_n == 1 )
fprintf(stderr, "From : %3d To : %3d Step = %3d N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,n,inc_x,inc_y,loops);
else
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
if ( has_param_n == 0 ) n = m;
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l=0; l<loops; l++)
{
gettimeofday( &start, (struct timezone *)0);
GER (&m, &n, alpha, x, &inc_x, y, &inc_y, a , &m);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

218
benchmark/gesv.c Normal file
View File

@@ -0,0 +1,218 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
double fabs(double);
#undef GESV
#undef GETRS
#ifndef COMPLEX
#ifdef XDOUBLE
#define GESV BLASFUNC(qgesv)
#elif defined(DOUBLE)
#define GESV BLASFUNC(dgesv)
#else
#define GESV BLASFUNC(sgesv)
#endif
#else
#ifdef XDOUBLE
#define GESV BLASFUNC(xgesv)
#elif defined(DOUBLE)
#define GESV BLASFUNC(zgesv)
#else
#define GESV BLASFUNC(cgesv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b;
blasint *ipiv;
blasint m, i, j, info;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops Time\n");
for(m = from; m <= to; m += step){
fprintf(stderr, " %dx%d : ", (int)m, (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
b[i + j * m * COMPSIZE] = 0.0;
}
}
for (j = 0; j < m; ++j) {
for (i = 0; i < m * COMPSIZE; ++i) {
b[i] += a[i + j * m * COMPSIZE];
}
}
gettimeofday( &start, (struct timezone *)0);
GESV (&m, &m, a, &m, ipiv, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
fprintf(stderr,
"%10.2f MFlops %10.6f s\n",
COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

234
benchmark/getri.c Normal file
View File

@@ -0,0 +1,234 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GETRF
#undef GETRI
#ifndef COMPLEX
#ifdef XDOUBLE
#define GETRF BLASFUNC(qgetrf)
#define GETRI BLASFUNC(qgetri)
#elif defined(DOUBLE)
#define GETRF BLASFUNC(dgetrf)
#define GETRI BLASFUNC(dgetri)
#else
#define GETRF BLASFUNC(sgetrf)
#define GETRI BLASFUNC(sgetri)
#endif
#else
#ifdef XDOUBLE
#define GETRF BLASFUNC(xgetrf)
#define GETRI BLASFUNC(xgetri)
#elif defined(DOUBLE)
#define GETRF BLASFUNC(zgetrf)
#define GETRI BLASFUNC(zgetri)
#else
#define GETRF BLASFUNC(cgetrf)
#define GETRI BLASFUNC(cgetri)
#endif
#endif
extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info);
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a,*work;
FLOAT wkopt[4];
blasint *ipiv;
blasint m, i, j, info,lwork;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
lwork = -1;
m=to;
GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
lwork = (blasint)wkopt[0];
if (( work = (FLOAT *)malloc(sizeof(FLOAT) * lwork * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE FLops Time Lwork\n");
for(m = from; m <= to; m += step){
fprintf(stderr, " %6d : ", (int)m);
GETRF (&m, &m, a, &m, ipiv, &info);
if (info) {
fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1);
}
gettimeofday( &start, (struct timezone *)0);
lwork = -1;
GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
lwork = (blasint)wkopt[0];
GETRI(&m, a, &m, ipiv, work, &lwork, &info);
gettimeofday( &stop, (struct timezone *)0);
if (info) {
fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
exit(1);
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
fprintf(stderr,
" %10.2f MFlops : %10.2f Sec : %d\n",
COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

192
benchmark/hemm.c Normal file
View File

@@ -0,0 +1,192 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HEMM
#ifdef DOUBLE
#define HEMM BLASFUNC(zhemm)
#else
#define HEMM BLASFUNC(chemm)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char *p;
char side='L';
char uplo='U';
if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c\n", from, to, step,side,uplo);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

208
benchmark/hemv.c Normal file
View File

@@ -0,0 +1,208 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HEMV
#ifdef DOUBLE
#define HEMV BLASFUNC(zhemv)
#else
#define HEMV BLASFUNC(chemv)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char uplo='L';
blasint m, i, j;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

191
benchmark/her2k.c Normal file
View File

@@ -0,0 +1,191 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HER2K
#ifdef DOUBLE
#define HER2K BLASFUNC(zher2k)
#else
#define HER2K BLASFUNC(cher2k)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char *p;
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

189
benchmark/herk.c Normal file
View File

@@ -0,0 +1,189 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HERK
#ifdef DOUBLE
#define HERK BLASFUNC(zherk)
#else
#define HERK BLASFUNC(cherk)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char *p;
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -83,22 +83,22 @@ int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
@@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
blasint *ipiv;
@@ -154,7 +154,7 @@ int MAIN__(int argc, char *argv[]){
struct timeval start, stop;
double time1, time2;
argc--;argv++;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
@@ -165,15 +165,15 @@ int MAIN__(int argc, char *argv[]){
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
@@ -181,7 +181,7 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, " SIZE Residual Decompose Solve Total\n");
for(m = from; m <= to; m += step){
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
@@ -189,9 +189,9 @@ int MAIN__(int argc, char *argv[]){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (i = 0; i < m * COMPSIZE; ++i) b[i] = 0.;
for (j = 0; j < m; ++j) {
for (i = 0; i < m * COMPSIZE; ++i) {
b[i] += a[i + j * m * COMPSIZE];
@@ -208,7 +208,7 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1);
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
@@ -221,7 +221,7 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1);
}
time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
maxerr = 0.;
@@ -239,7 +239,7 @@ int MAIN__(int argc, char *argv[]){
#endif
#endif
}
#ifdef XDOUBLE
fprintf(stderr," %Le ", maxerr);
#else
@@ -247,7 +247,7 @@ int MAIN__(int argc, char *argv[]){
#endif
fprintf(stderr,
" %10.2f MFlops %10.2f MFlops %10.2f MFlops\n",
" %10.2f MFlops %10.2f MFlops %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. / 3. * (double)m * (double)m * (double)m / time1 * 1.e-6,
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time2 * 1.e-6,
COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m) / (time1 + time2) * 1.e-6);
@@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

65
benchmark/plot-filter.sh Executable file
View File

@@ -0,0 +1,65 @@
#!/bin/sh
# **********************************************************************************
# Copyright (c) 2014, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************
# ************************************************************************
# sample filter for data output from benchmark programs
#
# usage example:
# ./dgemm.goto 2>&1|./plotfilter.sh >OpenBLAS
# ************************************************************************
if [ $# -eq 1 ]
then
arg1=$1
else
arg1=0
fi
case $arg1 in
L)
# Linpack Benchmark
awk '/MFlops/ { print $1,int($8) }'|tail --lines=+2
;;
C)
# Cholesky Benchmark
awk '/MFlops/ { print $3,int($9) }'|tail --lines=+2
;;
B)
# Copy Benchmark
awk '/MBytes/ { print $1,int($3) }'|tail --lines=+2
;;
*)
awk '/MFlops/ { print $1,int($3) }'|tail --lines=+2
;;
esac

42
benchmark/plot-header Normal file
View File

@@ -0,0 +1,42 @@
# **********************************************************************************
# Copyright (c) 2014, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************
set term x11 font sans;
set ylabel "MFlops";
set xlabel "Size";
set grid xtics;
set grid ytics;
set key left;
set timestamp "generated on %Y-%m-%d by `whoami`"
set title "Dtrsm\nUPLO=U TRANS=N SIDE=L\nBulldozer 1 Thread"
plot 'OpenBLAS' smooth bezier, 'ACML' smooth bezier, 'MKL' smooth bezier;
set output "print.png";
show title;
show plot;
show output;

286
benchmark/potrf.c Normal file
View File

@@ -0,0 +1,286 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
double fabs(double);
#undef POTRF
#ifndef COMPLEX
#ifdef XDOUBLE
#define POTRF BLASFUNC(qpotrf)
#define POTRS BLASFUNC(qpotrs)
#define POTRI BLASFUNC(qpotri)
#define SYRK BLASFUNC(qsyrk)
#elif defined(DOUBLE)
#define POTRF BLASFUNC(dpotrf)
#define POTRS BLASFUNC(dpotrs)
#define POTRI BLASFUNC(dpotri)
#define SYRK BLASFUNC(dsyrk)
#else
#define POTRF BLASFUNC(spotrf)
#define POTRS BLASFUNC(spotrs)
#define POTRI BLASFUNC(spotri)
#define SYRK BLASFUNC(ssyrk)
#endif
#else
#ifdef XDOUBLE
#define POTRF BLASFUNC(xpotrf)
#define POTRS BLASFUNC(xpotrs)
#define POTRI BLASFUNC(xpotri)
#define SYRK BLASFUNC(xherk)
#elif defined(DOUBLE)
#define POTRF BLASFUNC(zpotrf)
#define POTRS BLASFUNC(zpotrs)
#define POTRI BLASFUNC(zpotri)
#define SYRK BLASFUNC(zherk)
#else
#define POTRF BLASFUNC(cpotrf)
#define POTRS BLASFUNC(cpotrs)
#define POTRI BLASFUNC(cpotri)
#define SYRK BLASFUNC(cherk)
#endif
#endif
// extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info);
// extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info);
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
int main(int argc, char *argv[]){
#ifndef COMPLEX
char *trans[] = {"T", "N"};
#else
char *trans[] = {"C", "N"};
#endif
char *uplo[] = {"U", "L"};
FLOAT alpha[] = {1.0, 0.0};
FLOAT beta [] = {0.0, 0.0};
FLOAT *a, *b;
char *p;
char btest = 'F';
blasint m, i, j, info, uplos=0;
double flops;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_UPLO")))
if (*p == 'L') uplos=1;
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
for(m = from; m <= to; m += step){
#ifndef COMPLEX
if (uplos & 1) {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) a[i + j * m] = 0.;
a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
} else {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[i + j * m] = 0.;
}
}
#else
if (uplos & 1) {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) {
a[(i + j * m) * 2 + 0] = 0.;
a[(i + j * m) * 2 + 1] = 0.;
}
a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[(j + j * m) * 2 + 1] = 0.;
for(i = j + 1; i < m; i++) {
a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
}
} else {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) {
a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[(j + j * m) * 2 + 1] = 0.;
for(i = j + 1; i < m; i++) {
a[(i + j * m) * 2 + 0] = 0.;
a[(i + j * m) * 2 + 1] = 0.;
}
}
}
#endif
SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m);
gettimeofday( &start, (struct timezone *)0);
POTRF(uplo[uplos], &m, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0);
if (info != 0) {
fprintf(stderr, "Potrf info = %d\n", info);
exit(1);
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
if ( btest == 'S' )
{
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info);
gettimeofday( &stop, (struct timezone *)0);
if (info != 0) {
fprintf(stderr, "Potrs info = %d\n", info);
exit(1);
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
}
if ( btest == 'I' )
{
gettimeofday( &start, (struct timezone *)0);
POTRI(uplo[uplos], &m, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0);
if (info != 0) {
fprintf(stderr, "Potri info = %d\n", info);
exit(1);
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
}
fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

197
benchmark/rot.c Normal file
View File

@@ -0,0 +1,197 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define ROT BLASFUNC(drot)
#else
#define ROT BLASFUNC(srot)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
// FLOAT result;
blasint m, i;
blasint inc_x=1,inc_y=1;
FLOAT c[1] = { 2.0 };
FLOAT s[1] = { 2.0 };
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
ROT (&m, x, &inc_x, y, &inc_y, c, s);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

202
benchmark/scal.c Normal file
View File

@@ -0,0 +1,202 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SCAL
#ifdef COMPLEX
#ifdef DOUBLE
#define SCAL BLASFUNC(zscal)
#else
#define SCAL BLASFUNC(cscal)
#endif
#else
#ifdef DOUBLE
#define SCAL BLASFUNC(dscal)
#else
#define SCAL BLASFUNC(sscal)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SCAL (&m, alpha, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
#else
fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
#endif
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_cgemm(N,l):
A = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j;
B = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j;
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 8*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_cgemm(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_cgemv(N,l):
A = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j;
B = randn(N).astype('float32') + randn(N).astype('float32') * 1j;
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 8*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_cgemv(i,LOOPS)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
from scipy.linalg.blas import daxpy
def run_daxpy(N,l):
x = randn(N).astype('float64')
y = randn(N).astype('float64')
start = time.time();
for i in range(0,l):
y = daxpy(x,y, a=2.0 )
end = time.time()
timediff = (end -start)
mflops = ( 2*N ) *l / timediff
mflops *= 1e-6
size = "%d" % (N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_daxpy(i,LOOPS)

56
benchmark/scripts/NUMPY/ddot.py Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_ddot(N,l):
A = randn(N).astype('float64')
B = randn(N).astype('float64')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N ) *l / timediff
mflops *= 1e-6
size = "%d" % (N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_ddot(i,LOOPS)

55
benchmark/scripts/NUMPY/deig.py Executable file
View File

@@ -0,0 +1,55 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_deig(N,l):
A = randn(N,N).astype('float64')
start = time.time();
for i in range(0,l):
la,v = numpy.linalg.eig(A)
end = time.time()
timediff = (end -start)
mflops = ( 26.33 *N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_deig(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dgemm(N,l):
A = randn(N,N).astype('float64')
B = randn(N,N).astype('float64')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_dgemm(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dgemv(N,l):
A = randn(N,N).astype('float64')
B = randn(N).astype('float64')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_dgemv(i,LOOPS)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
from scipy.linalg.lapack import dgesv
def run_dgesv(N,l):
a = randn(N,N).astype('float64')
b = randn(N,N).astype('float64')
start = time.time();
for i in range(0,l):
dgesv(a,b,1,1)
end = time.time()
timediff = (end -start)
mflops = ( 2.0/3.0 *N*N*N + 2.0*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_dgesv(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dsolve(N,l):
A = randn(N,N).astype('float64')
B = randn(N,N).astype('float64')
start = time.time();
for i in range(0,l):
ref = numpy.linalg.solve(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2.0/3.0 *N*N*N + 2.0*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_dsolve(i,LOOPS)

56
benchmark/scripts/NUMPY/sdot.py Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sdot(N,l):
A = randn(N).astype('float32')
B = randn(N).astype('float32')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N ) *l / timediff
mflops *= 1e-6
size = "%d" % (N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_sdot(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sgemm(N,l):
A = randn(N,N).astype('float32')
B = randn(N,N).astype('float32')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_sgemm(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sgemv(N,l):
A = randn(N,N).astype('float32')
B = randn(N).astype('float32')
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 2*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_sgemv(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_zgemm(N,l):
A = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j;
B = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j;
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 8*N*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_zgemm(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_zgemv(N,l):
A = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j;
B = randn(N).astype('float64') + randn(N).astype('float64') * 1j;
start = time.time();
for i in range(0,l):
ref = numpy.dot(A,B)
end = time.time()
timediff = (end -start)
mflops = ( 8*N*N) *l / timediff
mflops *= 1e-6
size = "%dx%d" % (N,N)
print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))
if __name__ == "__main__":
N=128
NMAX=2048
NINC=128
LOOPS=1
z=0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p);
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range (N,NMAX+NINC,NINC):
run_zgemv(i,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = single(rand(n,n)) + single(rand(n,n)) * 1i;
B = single(rand(n,n)) + single(rand(n,n)) * 1i;
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 4.0 * 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = single(rand(n,n)) + single(rand(n,n)) * 1i;
B = single(rand(n,1)) + single(rand(n,1)) * 1i;
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 4.0 * 2.0*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

56
benchmark/scripts/OCTAVE/deig.m Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n));
start = clock();
l=0;
while l < loops
[V,lambda] = eig(A);
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 26.33 *n*n*n ) *loops / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg );
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n));
B = double(rand(n,n));
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n));
B = double(rand(n,1));
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 2.0*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,59 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n));
B = double(rand(n,n));
start = clock();
l=0;
while l < loops
x = linsolve(A,B);
#x = A / B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
#r = norm(A*x - B)/norm(B)
mflops = ( 2.0/3.0 *n*n*n + 2.0*n*n*n ) *loops / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg );
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = single(rand(n,n));
B = single(rand(n,n));
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = single(rand(n,n));
B = single(rand(n,1));
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 2.0*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n)) + double(rand(n,n)) * 1i;
B = double(rand(n,n)) + double(rand(n,n)) * 1i;
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 4.0 * 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
nfrom = 128 ;
nto = 2048;
nstep = 128;
loops = 1;
arg_list = argv();
for i = 1:nargin
switch(i)
case 1
nfrom = str2num(arg_list{i});
case 2
nto = str2num(arg_list{i});
case 3
nstep = str2num(arg_list{i});
case 4
loops = str2num(arg_list{i});
endswitch
endfor
p = getenv("OPENBLAS_LOOPS");
if p
loops = str2num(p);
endif
printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");
n = nfrom;
while n <= nto
A = double(rand(n,n)) + double(rand(n,n)) * 1i;
B = double(rand(n,1)) + double(rand(n,1)) * 1i;
start = clock();
l=0;
while l < loops
C = A * B;
l = l + 1;
endwhile
timeg = etime(clock(), start);
mflops = ( 4.0 * 2.0*n*n *loops ) / ( timeg * 1.0e6 );
st1 = sprintf("%dx%d : ", n,n);
printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg);
n = n + nstep;
endwhile

62
benchmark/scripts/R/deig.R Executable file
View File

@@ -0,0 +1,62 @@
#!/usr/bin/Rscript
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
start <- proc.time()[3]
while ( l <= loops ) {
ev <- eigen(A)
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = (26.66 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
}

63
benchmark/scripts/R/dgemm.R Executable file
View File

@@ -0,0 +1,63 @@
#!/usr/bin/Rscript
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
start <- proc.time()[3]
while ( l <= loops ) {
C <- A %*% B
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
}

63
benchmark/scripts/R/dsolve.R Executable file
View File

@@ -0,0 +1,63 @@
#!/usr/bin/Rscript
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
start <- proc.time()[3]
while ( l <= loops ) {
solve(A,B)
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
}

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python
import os
import sys
import time
import numpy
from numpy import zeros
from numpy.random import randn
from scipy.linalg import blas
def run_dsyrk(N, l):
A = randn(N, N).astype('float64', order='F')
C = zeros((N, N), dtype='float64', order='F')
start = time.time()
for i in range(0, l):
blas.dsyrk(1.0, A, c=C, overwrite_c=True)
end = time.time()
timediff = (end - start)
mflops = (N * N * N) * l / timediff
mflops *= 1e-6
size = "%dx%d" % (N, N)
print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
if __name__ == "__main__":
N = 128
NMAX = 2048
NINC = 128
LOOPS = 1
z = 0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p)
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range(N, NMAX + NINC, NINC):
run_dsyrk(i, LOOPS)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python
import os
import sys
import time
import numpy
from numpy import zeros
from numpy.random import randn
from scipy.linalg import blas
def run_ssyrk(N, l):
A = randn(N, N).astype('float32', order='F')
C = zeros((N, N), dtype='float32', order='F')
start = time.time()
for i in range(0, l):
blas.ssyrk(1.0, A, c=C, overwrite_c=True)
end = time.time()
timediff = (end - start)
mflops = (N * N * N) * l / timediff
mflops *= 1e-6
size = "%dx%d" % (N, N)
print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
if __name__ == "__main__":
N = 128
NMAX = 2048
NINC = 128
LOOPS = 1
z = 0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p)
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range(N, NMAX + NINC, NINC):
run_ssyrk(i, LOOPS)

196
benchmark/smallscaling.c Normal file
View File

@@ -0,0 +1,196 @@
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <omp.h>
#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10
// number of loop for a 1x1 matrix. Lower it if the test is
// too slow on you computer.
#define NLOOP 2e7
typedef struct {
int matrix_size;
int n_loop;
void (* bench_func)();
void (* blas_func)();
void * (* create_matrix)(int size);
} BenchParam;
void * s_create_matrix(int size) {
float * r = malloc(size * sizeof(double));
int i;
for(i = 0; i < size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * c_create_matrix(int size) {
float * r = malloc(size * 2 * sizeof(double));
int i;
for(i = 0; i < 2 * size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * z_create_matrix(int size) {
double * r = malloc(size * 2 * sizeof(double));
int i;
for(i = 0; i < 2 * size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * d_create_matrix(int size) {
double * r = malloc(size * sizeof(double));
int i;
for(i = 0; i < size; i++)
r[i] = 1e3 * i / size;
return r;
}
void trmv_bench(BenchParam * param)
{
int i, n;
int size = param->matrix_size;
n = param->n_loop / size;
int one = 1;
void * A = param->create_matrix(size * size);
void * y = param->create_matrix(size);
for(i = 0; i < n; i++) {
param->blas_func("U", "N", "N", &size, A, &size, y, &one);
}
free(A);
free(y);
}
void gemv_bench(BenchParam * param)
{
int i, n;
int size = param->matrix_size;
n = param->n_loop / size;
double v = 1.01;
int one = 1;
void * A = param->create_matrix(size * size);
void * y = param->create_matrix(size);
for(i = 0; i < n; i++) {
param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
}
free(A);
free(y);
}
void ger_bench(BenchParam * param) {
int i, n;
int size = param->matrix_size;
n = param->n_loop / size;
double v = 1.01;
int one = 1;
void * A = param->create_matrix(size * size);
void * y = param->create_matrix(size);
for(i = 0; i < n; i++) {
param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
}
free(A);
free(y);
}
#ifndef _WIN32
void * pthread_func_wrapper(void * param) {
((BenchParam *)param)->bench_func(param);
pthread_exit(NULL);
}
#endif
#define NB_TESTS 5
void * TESTS[4 * NB_TESTS] = {
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
gemv_bench, dgemv_, d_create_matrix, "dgemv",
gemv_bench, zgemv_, z_create_matrix, "zgemv",
ger_bench, dger_, d_create_matrix, "dger",
ger_bench, zgerc_, z_create_matrix, "zgerc",
};
inline static double delta_time(struct timespec tick) {
struct timespec tock;
clock_gettime(CLOCK_MONOTONIC, &tock);
return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
}
double pthread_bench(BenchParam * param, int nb_threads)
{
#ifdef _WIN32
return 0;
#else
BenchParam threaded_param = *param;
pthread_t threads[nb_threads];
int t, rc;
struct timespec tick;
threaded_param.n_loop /= nb_threads;
clock_gettime(CLOCK_MONOTONIC, &tick);
for(t=0; t<nb_threads; t++){
rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
if (rc){
printf("ERROR; return code from pthread_create() is %d\n", rc);
exit(-1);
}
}
for(t=0; t<nb_threads; t++){
pthread_join(threads[t], NULL);
}
return delta_time(tick);
#endif
}
double seq_bench(BenchParam * param) {
struct timespec tick;
clock_gettime(CLOCK_MONOTONIC, &tick);
param->bench_func(param);
return delta_time(tick);
}
double omp_bench(BenchParam * param) {
BenchParam threaded_param = *param;
struct timespec tick;
int t;
int nb_threads = omp_get_max_threads();
threaded_param.n_loop /= nb_threads;
clock_gettime(CLOCK_MONOTONIC, &tick);
#pragma omp parallel for
for(t = 0; t < nb_threads; t ++){
param->bench_func(&threaded_param);
}
return delta_time(tick);
}
int main(int argc, char * argv[]) {
double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
BenchParam param;
int test_id;
printf ("Running on %d threads\n", omp_get_max_threads());
for(test_id = 0; test_id < NB_TESTS; test_id ++) {
double size = MIN_SIZE;
param.bench_func = TESTS[test_id * 4];
param.blas_func = TESTS[test_id * 4 + 1];
param.create_matrix = TESTS[test_id * 4 + 2];
printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
param.n_loop = NLOOP;
while(size <= MAX_SIZE) {
param.matrix_size = (int)(size + 0.5);
double seq_time = seq_bench(&param);
double omp_time = omp_bench(&param);
double pthread_time = pthread_bench(&param, omp_get_max_threads());
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
"pthread %gs, speedup %g\n",
param.matrix_size, seq_time,
omp_time, seq_time / omp_time,
pthread_time, seq_time / pthread_time);
size *= inc_factor;
}
}
return(0);
}

201
benchmark/swap.c Normal file
View File

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above swapright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above swapright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE SWAPRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SWAP
#ifdef COMPLEX
#ifdef DOUBLE
#define SWAP BLASFUNC(zswap)
#else
#define SWAP BLASFUNC(cswap)
#endif
#else
#ifdef DOUBLE
#define SWAP BLASFUNC(dswap)
#else
#define SWAP BLASFUNC(sswap)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SWAP (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

203
benchmark/symm.c Normal file
View File

@@ -0,0 +1,203 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYMM
#ifndef COMPLEX
#ifdef DOUBLE
#define SYMM BLASFUNC(dsymm)
#else
#define SYMM BLASFUNC(ssymm)
#endif
#else
#ifdef DOUBLE
#define SYMM BLASFUNC(zsymm)
#else
#define SYMM BLASFUNC(csymm)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char *p;
char side='L';
char uplo='U';
if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c\n", from, to, step,side,uplo);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

218
benchmark/symv.c Normal file
View File

@@ -0,0 +1,218 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYMV
#ifndef COMPLEX
#ifdef DOUBLE
#define SYMV BLASFUNC(dsymv)
#else
#define SYMV BLASFUNC(ssymv)
#endif
#else
#ifdef DOUBLE
#define SYMV BLASFUNC(zsymv)
#else
#define SYMV BLASFUNC(csymv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char uplo='L';
blasint m, i, j;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

203
benchmark/syr2k.c Normal file
View File

@@ -0,0 +1,203 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYR2K
#ifndef COMPLEX
#ifdef DOUBLE
#define SYR2K BLASFUNC(dsyr2k)
#else
#define SYR2K BLASFUNC(ssyr2k)
#endif
#else
#ifdef DOUBLE
#define SYR2K BLASFUNC(zsyr2k)
#else
#define SYR2K BLASFUNC(csyr2k)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char *p;
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

199
benchmark/syrk.c Normal file
View File

@@ -0,0 +1,199 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYRK
#ifndef COMPLEX
#ifdef DOUBLE
#define SYRK BLASFUNC(dsyrk)
#else
#define SYRK BLASFUNC(ssyrk)
#endif
#else
#ifdef DOUBLE
#define SYRK BLASFUNC(zsyrk)
#else
#define SYRK BLASFUNC(csyrk)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char *p;
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

42
benchmark/tplot-header Normal file
View File

@@ -0,0 +1,42 @@
# **********************************************************************************
# Copyright (c) 2014, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************
set term x11 font sans;
set ylabel "MFlops";
set xlabel "Size";
set grid xtics;
set grid ytics;
set key left;
set timestamp "generated on %Y-%m-%d by `whoami`"
set title "Sgemv\nTRANS=T\nBulldozer"
plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier;
set output "print.png";
show title;
show plot;
show output;

202
benchmark/trmm.c Normal file
View File

@@ -0,0 +1,202 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TRMM
#ifndef COMPLEX
#ifdef DOUBLE
#define TRMM BLASFUNC(dtrmm)
#else
#define TRMM BLASFUNC(strmm)
#endif
#else
#ifdef DOUBLE
#define TRMM BLASFUNC(ztrmm)
#else
#define TRMM BLASFUNC(ctrmm)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char *p;
char side ='L';
char uplo ='U';
char trans='N';
char diag ='U';
if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c\n", from, to, step,side,uplo,trans,diag);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

219
benchmark/trsm.c Normal file
View File

@@ -0,0 +1,219 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TRSM
#ifndef COMPLEX
#ifdef DOUBLE
#define TRSM BLASFUNC(dtrsm)
#else
#define TRSM BLASFUNC(strsm)
#endif
#else
#ifdef DOUBLE
#define TRSM BLASFUNC(ztrsm)
#else
#define TRSM BLASFUNC(ctrsm)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char *p;
char side ='L';
char uplo ='U';
char trans='N';
char diag ='U';
int l;
int loops = 1;
double timeg;
if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
p = getenv("OPENBLAS_LOOPS");
if ( p != NULL )
loops = atoi(p);
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c Loops = %d\n", from, to, step,side,uplo,trans,diag,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0.0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
time1 = timeg/loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

196
benchmark/zdot-intel.c Normal file
View File

@@ -0,0 +1,196 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#define RETURN_BY_STACK 1
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT _Complex result;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
DOT (&result, &m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

195
benchmark/zdot.c Normal file
View File

@@ -0,0 +1,195 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT _Complex result;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = DOT (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

29
c_check
View File

@@ -3,6 +3,10 @@
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
$hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$binary = $ENV{"BINARY"};
@@ -27,7 +31,7 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
$cross_suffix = $1;
}
} else {
if ($ARGV[0] =~ /(.*-)(.*)/) {
if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
$cross_suffix = $1;
}
}
@@ -54,6 +58,7 @@ $os = osf if ($data =~ /OS_OSF/);
$os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
@@ -80,6 +85,10 @@ if (($architecture eq "mips32") || ($architecture eq "mips64")) {
$defined = 1;
}
if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
if ($architecture eq "alpha") {
$defined = 1;
$binary = 64;
@@ -180,9 +189,9 @@ $linker_a = "";
{
$link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
$link =~ s/\-Y\sP\,/\-Y/g;
@flags = split(/[\s\,\n]/, $link);
# remove leading and trailing quotes from each flag.
@flags = map {s/^['"]|['"]$//g; $_} @flags;
@@ -193,15 +202,15 @@ $linker_a = "";
&& ($flags !~ /^-LIST:/)
&& ($flags !~ /^-LANG:/)
) {
$linker_L .= $flags . " "
$linker_L .= $flags . " "
}
if ($flags =~ /^\-Y/) {
$linker_L .= "-Wl,". $flags . " "
$linker_L .= "-Wl,". $flags . " "
}
if (
($flags =~ /^\-l/)
($flags =~ /^\-l/)
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
@@ -213,7 +222,7 @@ $linker_a = "";
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
) {
$linker_l .= $flags . " "
$linker_l .= $flags . " "
}
$linker_a .= $flags . " " if $flags =~ /\.a$/;
@@ -250,9 +259,9 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
if ($os eq "LINUX") {
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
# if ($pthread[2] ne "") {
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
# } else {

58
cblas.h
View File

@@ -13,17 +13,26 @@ extern "C" {
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/*Get the number of threads on runtime.*/
int openblas_get_num_threads(void);
/*Get the number of physical processors (cores).*/
int openblas_get_num_procs(void);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);
/*Get the CPU corename on runtime.*/
char* openblas_get_corename(void);
/* Get the parallelization type which is used by OpenBLAS */
int openblas_get_parallel(void);
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
#define OPENBLAS_THREAD 1
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
#define OPENBLAS_OPENMP 2
#define OPENBLAS_OPENMP 2
/*
@@ -240,8 +249,13 @@ void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
@@ -305,6 +319,44 @@ void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
void cblas_xerbla(blasint p, char *rout, char *form, ...);
/*** BLAS extensions ***/
void cblas_saxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float *beta, float *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double *beta, double *y, OPENBLAS_CONST blasint incy);
void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a,
OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb);
void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a,
OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb);
void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a,
OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb);
void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a,
OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb);
void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
float *c, OPENBLAS_CONST blasint cldc);
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
double *c, OPENBLAS_CONST blasint cldc);
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
float *c, OPENBLAS_CONST blasint cldc);
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
double *c, OPENBLAS_CONST blasint cldc);
#ifdef __cplusplus
}
#endif /* __cplusplus */

View File

@@ -1,303 +0,0 @@
#ifndef CBLAS_H
#define CBLAS_H
#include <stddef.h>
#include "common.h"
#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */
/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);
/* Get the parallelization type which is used by OpenBLAS */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
#define OPENBLAS_OPENMP 2
#define CBLAS_INDEX size_t
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
float cblas_sasum (blasint n, float *x, blasint incx);
double cblas_dasum (blasint n, double *x, blasint incx);
float cblas_scasum(blasint n, float *x, blasint incx);
double cblas_dzasum(blasint n, double *x, blasint incx);
float cblas_snrm2 (blasint N, float *X, blasint incX);
double cblas_dnrm2 (blasint N, double *X, blasint incX);
float cblas_scnrm2(blasint N, float *X, blasint incX);
double cblas_dznrm2(blasint N, double *X, blasint incX);
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy);
void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy);
void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy);
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s);
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy);
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy);
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy);
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy);
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
float *Y, blasint incY, float *A, blasint lda);
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
double *Y, blasint incY, double *A, blasint lda);
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif

115
cmake/arch.cmake Normal file
View File

@@ -0,0 +1,115 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets various variables based on architecture.
if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
if (${ARCH} STREQUAL "x86")
if (NOT BINARY)
set(NO_BINARY_MODE 1)
endif ()
endif ()
if (NOT NO_EXPRECISION)
if (${F_COMPILER} MATCHES "GFORTRAN")
# N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
set(EXPRECISION 1)
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double")
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
set(EXPRECISION 1)
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION")
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
endif ()
endif ()
endif ()
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981")
endif ()
if (USE_OPENMP)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
message(WARNING "Clang doesn't support OpenMP yet.")
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -openmp")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
set(CEXTRALIB "${CEXTRALIB} -lstdc++")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
endif ()
endif ()
if (DYNAMIC_ARCH)
if (${ARCH} STREQUAL "x86")
set(DYNAMIC_CORE "KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO")
endif ()
if (${ARCH} STREQUAL "x86_64")
set(DYNAMIC_CORE "PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO")
if (NOT NO_AVX)
set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER")
endif ()
if (NOT NO_AVX2)
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL")
endif ()
endif ()
if (NOT DYNAMIC_CORE)
unset(DYNAMIC_ARCH)
endif ()
endif ()
if (${ARCH} STREQUAL "ia64")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
if (${F_COMPILER} MATCHES "GFORTRAN")
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
# EXPRECISION = 1
# CCOMMON_OPT += -DEXPRECISION
endif ()
endif ()
endif ()
if (${ARCH} STREQUAL "mips64")
set(NO_BINARY_MODE 1)
endif ()
if (${ARCH} STREQUAL "alpha")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()
if (${ARCH} STREQUAL "arm")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()
if (${ARCH} STREQUAL "arm64")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()

89
cmake/c_check.cmake Normal file
View File

@@ -0,0 +1,89 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from the OpenBLAS/c_check perl script.
## This is triggered by prebuild.cmake and runs before any of the code is built.
## Creates config.h and Makefile.conf.
# CMake vars set by this file:
# OSNAME (use CMAKE_SYSTEM_NAME)
# ARCH
# C_COMPILER (use CMAKE_C_COMPILER)
# BINARY32
# BINARY64
# FU
# CROSS_SUFFIX
# CROSS
# CEXTRALIB
# Defines set by this file:
# OS_
# ARCH_
# C_
# __32BIT__
# __64BIT__
# FUNDERSCORE
# PTHREAD_CREATE_FUNC
# N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables.
set(FU "")
if(APPLE)
set(FU "_")
elseif(MSVC)
set(FU "_")
elseif(UNIX)
set(FU "")
endif()
# Convert CMake vars into the format that OpenBLAS expects
string(TOUPPER ${CMAKE_SYSTEM_NAME} HOST_OS)
if (${HOST_OS} STREQUAL "WINDOWS")
set(HOST_OS WINNT)
endif ()
# added by hpa - check size of void ptr to detect 64-bit compile
if (NOT DEFINED BINARY)
set(BINARY 32)
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
set(BINARY 64)
endif ()
endif ()
if (BINARY EQUAL 64)
set(BINARY64 1)
else ()
set(BINARY32 1)
endif ()
# CMake docs define these:
# CMAKE_SYSTEM_PROCESSOR - The name of the CPU CMake is building for.
# CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on.
#
# TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check
set(ARCH ${CMAKE_SYSTEM_PROCESSOR})
if (${ARCH} STREQUAL "AMD64")
set(ARCH "x86_64")
endif ()
# If you are using a 32-bit compiler on a 64-bit system CMAKE_SYSTEM_PROCESSOR will be wrong
if (${ARCH} STREQUAL "x86_64" AND BINARY EQUAL 32)
set(ARCH x86)
endif ()
if (${ARCH} STREQUAL "X86")
set(ARCH x86)
endif ()
set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
set(COMPILER_ID "GCC")
endif ()
string(TOUPPER ${ARCH} UC_ARCH)
file(WRITE ${TARGET_CONF}
"#define OS_${HOST_OS}\t1\n"
"#define ARCH_${UC_ARCH}\t1\n"
"#define C_${COMPILER_ID}\t1\n"
"#define __${BINARY}BIT__\t1\n"
"#define FUNDERSCORE\t${FU}\n")

103
cmake/cc.cmake Normal file
View File

@@ -0,0 +1,103 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets C related variables.
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
set(NO_UNINITIALIZED_WARN "-Wno-uninitialized")
if (QUIET_MAKE)
set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused")
endif ()
if (NO_BINARY_MODE)
if (${ARCH} STREQUAL "mips64")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=n32")
endif ()
set(BINARY_DEFINED 1)
endif ()
if (${CORE} STREQUAL "LOONGSON3A")
set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64")
set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64")
endif ()
if (${CORE} STREQUAL "LOONGSON3B")
set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64")
set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64")
endif ()
if (${OSNAME} STREQUAL "AIX")
set(BINARY_DEFINED 1)
endif ()
endif ()
if (NOT BINARY_DEFINED)
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
endif ()
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
if (${ARCH} STREQUAL "mips64")
if (NOT BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -n32")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -n64")
endif ()
if (${CORE} STREQUAL "LOONGSON3A")
set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static")
endif ()
if (${CORE} STREQUAL "LOONGSON3B")
set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static")
endif ()
else ()
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
endif ()
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
if (${ARCH} STREQUAL "x86")
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
endif ()

60
cmake/export.cmake Normal file
View File

@@ -0,0 +1,60 @@
#Only generate .def for dll on MSVC
if(MSVC)
set_source_files_properties(${OpenBLAS_DEF_FILE} PROPERTIES GENERATED 1)
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
else()
set(ARCH_IN ${ARCH})
endif()
if (${CORE} STREQUAL "generic")
set(ARCH_IN "GENERIC")
endif ()
if (NOT DEFINED EXPRECISION)
set(EXPRECISION_IN 0)
else()
set(EXPRECISION_IN ${EXPRECISION})
endif()
if (NOT DEFINED NO_CBLAS)
set(NO_CBLAS_IN 0)
else()
set(NO_CBLAS_IN ${NO_CBLAS})
endif()
if (NOT DEFINED NO_LAPACK)
set(NO_LAPACK_IN 0)
else()
set(NO_LAPACK_IN ${NO_LAPACK})
endif()
if (NOT DEFINED NO_LAPACKE)
set(NO_LAPACKE_IN 0)
else()
set(NO_LAPACKE_IN ${NO_LAPACKE})
endif()
if (NOT DEFINED NEED2UNDERSCORES)
set(NEED2UNDERSCORES_IN 0)
else()
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
endif()
if (NOT DEFINED ONLY_CBLAS)
set(ONLY_CBLAS_IN 0)
else()
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
endif()
add_custom_command(
TARGET ${OpenBLAS_LIBNAME} PRE_LINK
COMMAND perl
ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
COMMENT "Create openblas.def file"
VERBATIM)
endif()

66
cmake/f_check.cmake Normal file
View File

@@ -0,0 +1,66 @@
##
## Author: Hank Anderson <hank@statease.com>
## Copyright: (c) Stat-Ease, Inc.
## Created: 12/29/14
## Last Modified: 12/29/14
## Description: Ported from the OpenBLAS/f_check perl script.
## This is triggered by prebuild.cmake and runs before any of the code is built.
## Appends Fortran information to config.h and Makefile.conf.
# CMake vars set by this file:
# F_COMPILER
# FC
# BU
# NOFORTRAN
# NEED2UNDERSCORES
# FEXTRALIB
# Defines set by this file:
# BUNDERSCORE
# NEEDBUNDERSCORE
# NEED2UNDERSCORES
if (MSVC)
# had to do this for MSVC, else CMake automatically assumes I have ifort... -hpa
include(CMakeForceCompiler)
CMAKE_FORCE_Fortran_COMPILER(gfortran GNU)
endif ()
if (NOT NO_LAPACK)
enable_language(Fortran)
else()
include(CMakeForceCompiler)
CMAKE_FORCE_Fortran_COMPILER(gfortran GNU)
endif()
if (NOT ONLY_CBLAS)
# N.B. f_check is not cross-platform, so instead try to use CMake variables
# run f_check (appends to TARGET files)
# message(STATUS "Running f_check...")
# execute_process(COMMAND perl f_check ${TARGET_MAKE} ${TARGET_CONF} ${CMAKE_Fortran_COMPILER}
# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
# TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile
# TODO: set FEXTRALIB flags a la f_check?
set(BU "_")
file(APPEND ${TARGET_CONF}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n"
"#define NEED2UNDERSCORES 0\n")
else ()
#When we only build CBLAS, we set NOFORTRAN=2
set(NOFORTRAN 2)
set(NO_FBLAS 1)
#set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler
set(BU "_")
file(APPEND ${TARGET_CONF}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
endif()
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
string(TOUPPER ${F_COMPILER} F_COMPILER)

Some files were not shown because too many files have changed in this diff Show More